Abstract This document contains the code used to reproduce the results of the article Estimating Bilateral Migration Flows: A Novel Optimal Transport-Based Approach. It also enables users to compute various bilateral migration flow estimates between country pairs from 1990 to 2024, including those presented by Abel and Cohen (2019). The analyses are based on the most recent United Nations population stock data (1990–2024), with specific adjustments applied at various stages of the estimation process. In addition to the core estimation routines, the document includes the code used to generate all figures and results presented in the article, as well as supplementary visualizations not featured in the published version.
Prerequisites
Packages needed :
# Core data manipulation and visualization
library(tidyverse) # Collection of packages for data science (ggplot2, dplyr, etc.)
library(scales) # Functions to scale data for visualization (e.g., axis formatting)
library(viridis) # Colorblind-friendly palettes for ggplot2
library(colorspace) # Advanced color manipulation and palettes
# Migration estimation and related utilities
library(migest) # Tools for working with migration flow data (Abel & Cohen)
library(Barycenter) # Optimal transport algorithms for flow estimation
# Country names and visual representation
library(countrycode) # Convert between country codes and names (e.g., ISO3 to full name)
library(ggflags) # Add country flags to ggplot2 visualizations
# Web and spatial data
library(rvest) # Web scraping: import tables and content from HTML pages
library(sf) # Simple Features for working with spatial/vector data in RNotes on Package Installation
# ---------------------------------------------------
# 📦 Install ggflags package (for country flags)
# Note: ggflags is hosted on an alternative CRAN-like repo
install.packages(
"ggflags",
repos = c("https://jimjam-slam.r-universe.dev", "https://cloud.r-project.org")
)
# ---------------------------------------------------
# 📦 Install Barycenter package (for optimal transport computations)
# ⚠️ Barycenter requires Fortran, especially on Mac
# If you're on macOS, follow these setup steps BEFORE installing:
# 1. Open Terminal and run:
# brew install gcc
# 2. Configure Fortran paths for R:
# mkdir -p ~/.R
# nano ~/.R/Makevars
# Then paste the following into the Makevars file:
# FC = /opt/homebrew/bin/gfortran
# F77 = /opt/homebrew/bin/gfortran
# FLIBS = -L/opt/homebrew/opt/gcc/lib/gcc/current -lgfortran -lquadmath -lm
# 3. (Optional but recommended)
# export LDFLAGS="-L/opt/homebrew/opt/gcc/lib/gcc/current"
# export CPPFLAGS="-I/opt/homebrew/opt/gcc/include"
# 4. Now install the archived package in R:
devtools::install_url('http://cran.r-project.org/src/contrib/Archive/Barycenter/Barycenter_1.3.tar.gz')Functions used to make estimation:
# function that uses Jefferson method to allocate migrants according to proportion
jefferson <- function(my_vect, n_seats) {
n <- length(my_vect)
quotient <- sum(my_vect) / n_seats
if(quotient != 0) {
# step 1
my_seats <- my_vect %/% quotient
n_seats <- n_seats - sum(my_seats)
# step 2
reste <- abs(my_vect %% quotient)
temp <- rev(order(reste))[1:n_seats]
my_seats[temp] <- my_seats[temp] + 1
} else {
my_seats <- numeric(n)
}
return(my_seats)
}
# function that computes drop and reverse negative methods
my_diff <- function(m1, m2, type = "drop") {
stopifnot(type %in% c("drop", "reverse"))
diff <- m2 - m1
# drop
hat_drop <- diff
hat_drop[hat_drop < 0] <- 0
diag(hat_drop) <- 0
if(type == "drop") {
return(hat_drop)
} else {
# reverse
hat_reverse <- t(diff)
hat_reverse[hat_reverse > 0] <- 0
hat_reverse <- hat_drop - hat_reverse
diag(hat_reverse) <- 0
return(hat_reverse)
}
}
# function that computes minimisation method (extract from ffs_demo())
my_min <- function(m1, m2) {
n <- nrow(m1)
my_names <- rownames(m1)
# minimisation based on Open
d0 <- expand.grid(a = 1:n, b = 1:n)
diag_count <- array(0, c(n, n, n))
diag_count <- with(data = d0, expr = replace(x = diag_count,
list = cbind(a, a, b), values = apply(X = cbind(c(t(m1)),
c(t(m2))), MARGIN = 1, FUN = min)))
diag_zero <- diag_count + 1
diag_zero <- with(data = d0, expr = replace(x = diag_zero,
list = cbind(a, a, b), values = 0))
x0 <- t(m1) - apply(X = diag_count, MARGIN = c(1, 3),
FUN = sum)
x1 <- t(m2) - apply(X = diag_count, MARGIN = c(1, 3),
FUN = sum)
x1 <- mipfp::Ipfp(seed = x1, target.list = list(2), target.data = list(colSums(x0)),
tol = 1, iter = 1e+05, tol.margins = 1)$x.hat
a0 <- mipfp::Ipfp(seed = diag_zero, target.list = list(c(1, 3), c(2, 3)),
target.data = list(x0, x1), tol = 1, iter = 1e+05,
tol.margins = 1)
f0 <- a0$x.hat + diag_count
res <- round(migest::sum_od(f0))[-(n+1), -(n+1)]
rownames(res) <- my_names
colnames(res) <- my_names
return(res)
}
# Optimal transport
my_transport <- function(stock_0, stock_1, birth, residence, hat_outflows,
hat_inflows, rate_returns) {
# initialization
my_names <- unique(birth)
n <- length(my_names)
# results
res <- matrix(0, n, n)
outwards <- numeric(n^2)
returns <- numeric(n^2)
transit <- numeric(n^2)
# rule to compute stayers
Z1 <- stock_0 - hat_outflows
Z2 <- stock_1 - hat_inflows
Z <- ifelse(Z1 <= pmin(stock_0, stock_1), Z1, Z2)
for(k in 1:n) {
# country k
ind_birth <- which(birth == my_names[k])
x0_k <- stock_0[ind_birth]
x1_k <- stock_1[ind_birth]
# stayers in foreign countries
Z_k <- Z[ind_birth]
ind_res <- which(residence[ind_birth] == my_names[k])
tot_moov <- sum((x0_k - Z_k)[-ind_res])
# stayers in country k
Z_k[ind_res] <- x1_k[ind_res] - tot_moov * rate_returns
# mass at t0 and t1
mass_0 <- x0_k - Z_k
mass_1 <- x1_k - Z_k
# correct potentiel negative values
mass_0[mass_0 < 0] <- 0
# normalize
t_mass_0 <- sum(mass_0)
t_mass_1 <- sum(mass_1)
if(t_mass_0 != t_mass_1) {
if(t_mass_0 > t_mass_1)
mass_0 <- mass_0 * t_mass_1 / t_mass_0
else
mass_1 <- mass_1 * t_mass_0 / t_mass_1
}
my_cost <- matrix(ifelse(x0_k == 0, 1, 1/x0_k), n, n, byrow = T)
diag(my_cost) <- 10^9
# check
for(i in 1:n) {
sum_ot <- sum(mass_1[-i])
if(mass_0[i] > sum_ot) {
mass_1[i] <- mass_1[i] - (mass_0[i] - sum_ot)
if(mass_1[i] < 0)
cat("negative value")
mass_0[i] <- sum_ot
}
}
temp <- Barycenter::Sinkhorn(matrix(mass_0, ncol = 1),
matrix(mass_1, ncol = 1),
my_cost, lambda = 1, maxIter = 10000,
tolerance=10^(-3))$Transportplan
#
if(is.nan(temp[1, 1])) {
cat(k, "\n")
temp <- transport::transport(mass_0, mass_1,
costm = my_cost,
fullreturn = T,
method = "networkflow")$primal
}
# total
res <- res + temp
# outward
temp_1 <- temp
temp_1[!((1:n) == k), ] = 0
temp_1[k, k] = 0
outwards <- outwards + as.vector(temp_1)
# return
temp_1 <- temp
temp_1[, !((1:n) == k)] = 0
temp_1[k, k] = 0
returns <- returns + as.vector(temp_1)
# transit
temp_1 <- temp
temp_1[, k] = 0
temp_1[k, ] = 0
transit <- transit + as.vector(temp_1)
}
rownames(res) <- my_names
colnames(res) <- my_names
return(list(res = res,
outwards = outwards,
returns = returns,
transit = transit))
}
# Compute Independance
bayes <- function(m1, m2) {
n <- nrow(m1)
my_names <- rownames(m1)
chi2 <- matrix(0, n, n)
for (k in 1:n) {
temp1 <- matrix(m1[k, ], n, n)
temp2 <- matrix(m2[k, ], n, n, byrow = T)
chi2 <- chi2 + temp1 * temp2 / sum(m1[k, ])
}
rownames(chi2) <- my_names
colnames(chi2) <- my_names
return(round(chi2))
}1 Import and manage the data
This section focuses on importing and cleaning the various datasets required for our analysis. We also include several visualization tools to facilitate a better understanding of the data.
1.1 Table of countries
To define the regions of interest, we rely on standardized country codes—either ISO3 or M49. While the United Nations primarily uses the M49 classification, ISO3 codes are widely used in other datasets and are often more convenient for merging or labeling purposes. To accommodate both formats, we maintain a reference table that includes:
- The country name
- Its corresponding ISO3 code
- Its M49 code
This ensures compatibility across various sources and facilitates data manipulation throughout the project.
1.2 International Migrant Stock 2024
We use the International Migrant Stock 2024 dataset, available from the United Nations Department of Economic and Social Affairs at https://www.un.org/development/desa/pd/content/international-migrant-stock. After importing the data, we standardize the variable names for consistency and clarity. We then merge the dataset with our reference table containing ISO3 and M49 country codes, along with standardized country names. This step ensures compatibility across datasets and allows for seamless integration in subsequent analyses.
# The years of interest:
years <- c("1990", "1995", "2000", "2005", "2010", "2015", "2020", "2024")
un <- readxl::read_xlsx("data/undesa_pd_2024_ims_stock_by_sex_destination_and_origin.xlsx",
sheet = "Table 1", skip = 10)
un <- un %>%
dplyr::select(c(2, 5, 6:31)) %>%
rename(dest = 1, dest_code = 2, birth = 3, birth_code = 4) %>%
filter(!(dest_code %in% c(663, 652, 535, 336)))
names(un)[5:31] <- c(paste0("tot_", years),
paste0("m_", years),
paste0("f_", years))Note: Saint Barthélemy and the French part of Saint Martin are excluded from the analysis due to the absence of recorded foreign-born population data. For similar reasons, Vatican City and Bonaire, Sint Eustatius and Saba are also excluded.
The International Migrant Stock (IMS) dataset provides, for each reference year and destination country, the number of international migrants disaggregated by country of birth.
un_code <- un %>%
filter(birth_code %in% unique(un$birth_code[un$birth_code < 900]))
un_code <- merge(un, country[, c("iso", "code")], by.x = "dest_code", by.y = "code")
un_code <- un_code %>%
dplyr::select(-dest_code) %>%
rename(dest_code = iso) %>%
filter(dest_code %in% country$iso)
country <- country %>%
filter(iso %in% unique(un_code$dest_code))
n <- nrow(country)Finally, we have 230 countries in our dataset.
Note: For all destination countries, some migrants
may have an unknown country of origin (recorded as "Others"
in the dataset). We retain this information, as it will be used later
during demographic adjustments to ensure consistency and completeness of
the migration flow estimates.
In previous versions of the database, several countries—namely
Austria, China, Taiwan Province of China, France, Greece, Ireland,
Libya, Saint Barthélemy, and Saint Martin—did not include an
"Others" category. Furthermore, the "Others"
category across all countries was generally underestimated compared to
the 2024 version of the database. For this reason, we conducted a
detailed statistical analysis focusing on migrants with unknown or
unspecified origins to better understand and account for this
discrepancy.
Count of Unknown Migrants: Among countries with more than 1,000,000 migrants whose country of birth is unknown, Germany stands out, peaking at nearly 4,000,000 in 2005. The United States follows with a peak of almost 5,000,000 in 2020, and the United Kingdom with nearly 3,000,000 in 2024. Notably, previous versions of the dataset reported substantially fewer unknown cases—for example, in 2005, Germany recorded only about 150,000 migrants with unknown origins, representing a difference of approximately 3,850,000 compared to the current data.
This discrepancy can be explained by two main factors. First, compared to the previous dataset, the total recorded foreign population has increased. For example, in 2005, the total number of migrants in Germany rose from 9,400,000 to 11,200,000, an increase of 1,800,000. The second reason likely relates to changes in methodology. It appears that in the previous version, some migrants were assigned a country of birth that is now categorized as “Unknown.” For instance, in Germany in 2005, there were around 5,600,000 migrants from Europe, whereas the updated dataset reports only 4,500,000. In other words, migration data is now available for fewer specific countries, with more cases being grouped under the “Unknown” category.
Shares of unknown: Across all the data, the percentage of “Unknwon” equals 8.38\(%\) of the total number of migrants, which is quite significant. We represent them across the years using a parallel boxplot:
# The share of Other
temp_3 <- temp_1[, 4:11] / temp_2[, 4:11]
par(las = 1)
boxplot(temp_3, names = years, main = "Shares")
abline(h = 0.5, lty = 2)
In particular, the number of ‘Unknown’ sometimes accounts for more than
50% of migrants in certain destination countries. These countries
are:
## [1] "Micronesia (Fed. States of)" "Saint Kitts and Nevis"
## [3] "Viet Nam" "South Sudan"
## [5] "Syrian Arab Republic" "Tonga"
## [7] "Tokelau*" "Somalia"
## [9] "Angola" "Equatorial Guinea"
We add the code of ISO
un_code <- merge(un_code, country[, c("iso", "code")], by.x = "birth_code",
by.y = "code")
un_code <- un_code %>%
dplyr::select(-birth_code) %>%
rename(birth_code = iso) %>%
filter(birth_code %in% country$iso) %>%
arrange(dest)Finally, using the IMS database, we count the total number of migrants, including those from both known and unknown countries of birth. The data is then grouped by the observation period.
un_foreign <- un %>%
filter(birth == "World")
temp_names <- c("LocID", "tot_foreign_t0", "tot_foreign_t1", "m_foreign_t0",
"m_foreign_t1", "f_foreign_t0", "f_foreign_t1")
type_sexe <- c("tot_", "tot_", "m_", "m_", "f_", "f_")
un_foreign_t1 <- un_foreign[, c("dest_code", paste0(type_sexe, c(1990, 1995)))]
names(un_foreign_t1) <- temp_names
un_foreign_t1$period <- "1990-1995"
un_foreign_t2 <- un_foreign[, c("dest_code", paste0(type_sexe, c(1995, 2000)))]
names(un_foreign_t2) <- temp_names
un_foreign_t2$period <- "1995-2000"
un_foreign_t3 <- un_foreign[, c("dest_code", paste0(type_sexe, c(2000, 2005)))]
names(un_foreign_t3) <- temp_names
un_foreign_t3$period <- "2000-2005"
un_foreign_t4 <- un_foreign[, c("dest_code", paste0(type_sexe, c(2005, 2010)))]
names(un_foreign_t4) <- temp_names
un_foreign_t4$period <- "2005-2010"
un_foreign_t5 <- un_foreign[, c("dest_code", paste0(type_sexe, c(2010, 2015)))]
names(un_foreign_t5) <- temp_names
un_foreign_t5$period <- "2010-2015"
un_foreign_t6 <- un_foreign[, c("dest_code", paste0(type_sexe, c(2015, 2020)))]
names(un_foreign_t6) <- temp_names
un_foreign_t6$period <- "2015-2020"
un_foreign_t7 <- un_foreign[, c("dest_code", paste0(type_sexe, c(2020, 2024)))]
names(un_foreign_t7) <- temp_names
un_foreign_t7$period <- "2020-2024"
un_foreign <- rbind(un_foreign_t1, un_foreign_t2, un_foreign_t3, un_foreign_t4,
un_foreign_t5, un_foreign_t6, un_foreign_t7)1.3 WPP 2024
We use the WPP 2024 dataset (available at https://population.un.org/wpp/Download/Standard/CSV/) to obtain data on population, deaths, and births.
wpp2024 <- readxl::read_xlsx("data/WPP2024_GEN_F01_DEMOGRAPHIC_INDICATORS_FULL.xlsx",
sheet = "Estimates", skip = 16)
projection <- readxl::read_xlsx("data/WPP2024_GEN_F01_DEMOGRAPHIC_INDICATORS_FULL.xlsx",
sheet = "Medium variant", skip = 16)
wpp2024 <- rbind(wpp2024, projection)
birth <- wpp2024 %>%
dplyr::select(3, 5, 11, 24, 30) %>%
rename(Location = 1, LocID = 2, Time = 3, Births = 4, Ratio = 5) %>%
mutate(Births = as.numeric(Births),
Ratio = as.numeric(Ratio),
RatioMale = Ratio / (Ratio + 100),
RatioFemale = 100 / (Ratio + 100),
BirthsMale = Births * RatioMale,
BirthsFemale = Births * RatioFemale)
death <- wpp2024 %>%
dplyr::select(3, 5, 11, 31, 32, 33) %>%
rename(Location = 1, LocID = 2, Time = 3, DeathTotal = 4, DeathMale = 5, DeathFemale = 6) %>%
mutate(DeathTotal = as.numeric(DeathTotal),
DeathMale = as.numeric(DeathMale),
DeathFemale = as.numeric(DeathFemale))
pop <- wpp2024 %>%
dplyr::select(3, 5, 11, 13, 14, 15) %>%
rename(Location = 1, LocID = 2, Time = 3, PopTotal = 4, PopMale = 5, PopFemale = 6) %>%
mutate(PopTotal = as.numeric(PopTotal),
PopMale = as.numeric(PopMale),
PopFemale = as.numeric(PopFemale))We retain data for the years of interest. Births are aggregated into 5-year groups, while deaths are aggregated into 5-year groups and further categorized by sex. Moreover, birth and death data are provided from January 1st to December 31st. Since migrant stock data are available as of July 1st for the years 1990, 1995, 2000, 2005, 2010, 2015, 2020, and 2024, we divide the corresponding birth and death figures for these years by two.
pop <- pop %>%
filter(Time %in% c(1990, 1995, 2000, 2005, 2010, 2015, 2020, 2024))
# create groups of five years
birth_period <- data.frame(period = character(),
LoID = numeric(),
Location = numeric(),
tot_birth = numeric())
death_period <- data.frame(period = character(),
LoID = numeric(),
Location = numeric(),
tot_death = numeric(),
m_death = numeric(),
f_death = numeric())
for(k in 1:(length(years) - 1)) {
begin_year <- as.numeric(years[k])
end_year <- as.numeric(years[k+1])
birth_period <- rbind(birth_period, birth %>%
filter(Time %in% begin_year:end_year) %>%
mutate(Births = ifelse(Time %in% c(begin_year, end_year), Births / 2, Births),
BirthsMale = ifelse(Time %in% c(begin_year, end_year),
BirthsMale / 2, BirthsMale),
BirthsFemale = ifelse(Time %in% c(begin_year, end_year),
BirthsFemale / 2, BirthsFemale),
period = paste0(begin_year, "-", end_year)) %>%
group_by(period, LocID, Location) %>%
summarise(tot_birth = sum(Births),
m_birth = sum(BirthsMale),
f_birth = sum(BirthsFemale)))
death_period <- rbind(death_period, death %>%
filter(Time %in% begin_year:end_year) %>%
mutate(DeathTotal = ifelse(Time %in% c(begin_year, end_year), DeathTotal / 2, DeathTotal),
DeathMale = ifelse(Time %in% c(begin_year, end_year), DeathMale / 2, DeathMale),
DeathFemale = ifelse(Time %in% c(begin_year, end_year), DeathFemale / 2, DeathFemale),
period = paste0(begin_year, "-", end_year)) %>%
group_by(period, LocID, Location) %>%
summarise(tot_death = sum(DeathTotal),
m_death = sum(DeathMale),
f_death = sum(DeathFemale)
))
}
demo <- merge(birth_period, death_period, by = c("period", "LocID", "Location"))
demo <- demo %>%
filter(period != "Others")
for(k in 1:nrow(demo)) {
t0 <- as.numeric(substr(demo$period[k], 1, 4))
t1 <- t0 + ifelse(t0 == 2020, 4, 5)
my_loc <- demo$LocID[k]
demo[k, "tot_pop_t0"] <- pop[pop$LocID == my_loc & pop$Time == t0, "PopTotal"]
demo[k, "tot_pop_t1"] <- pop[pop$LocID == my_loc & pop$Time == t1, "PopTotal"]
demo[k, "m_pop_t0"] <- pop[pop$LocID == my_loc & pop$Time == t0, "PopMale"]
demo[k, "m_pop_t1"] <- pop[pop$LocID == my_loc & pop$Time == t1, "PopMale"]
demo[k, "f_pop_t0"] <- pop[pop$LocID == my_loc & pop$Time == t0, "PopFemale"]
demo[k, "f_pop_t1"] <- pop[pop$LocID == my_loc & pop$Time == t1, "PopFemale"]
}
demo$tot_birth <- round(demo$tot_birth * 10^3)
demo$m_birth <- round(demo$m_birth* 10^3) # round(demo$tot_birth / 2)
demo$f_birth <- round(demo$f_birth* 10^3) # round(demo$tot_birth / 2)
demo$tot_death <- round(demo$tot_death * 10^3)
demo$m_death <- round(demo$m_death * 10^3)
demo$f_death <- round(demo$f_death * 10^3)
demo$tot_pop_t0 <- round(demo$tot_pop_t0 * 10^3)
demo$tot_pop_t1 <- round(demo$tot_pop_t1 * 10^3)
demo$m_pop_t0 <- round(demo$m_pop_t0 * 10^3)
demo$m_pop_t1 <- round(demo$m_pop_t1 * 10^3)
demo$f_pop_t0 <- round(demo$f_pop_t0 * 10^3)
demo$f_pop_t1 <- round(demo$f_pop_t1 * 10^3)In the population data, the Channel Islands do not appear as a single entity because they are divided into Guernsey and Jersey. Therefore, we aggregate these two parts into one.
Guernsey <- demo %>%
filter(Location == "Guernsey")
Jersey <- demo %>%
filter(Location == "Jersey")
channel <- Guernsey
channel$LocID <- 830
channel$Location <- "Channel Islands"
channel[, 4:13] <- Guernsey[, 4:13] + Jersey[, 4:13]
demo <- demo %>%
filter(LocID %in% country$code) %>%
rbind(channel)We obtain a single table that, for each country and each 5-year period, provides information on births, deaths, and population at \(t_0\) and \(t_1\) categorized by sex.
## period LocID Location tot_birth m_birth f_birth tot_death m_death f_death
## 1 1990-1995 100 Bulgaria 439712 226097 213615 554219 306129 248089
## 2 1990-1995 104 Myanmar 5417066 2793917 2623150 2256714 1224789 1031925
## 3 1990-1995 108 Burundi 1291897 653455 638442 579007 299690 279318
## 4 1990-1995 112 Belarus 591556 305011 286545 598033 306501 291531
## 5 1990-1995 116 Cambodia 1848323 945592 902731 496028 257752 238276
## 6 1990-1995 12 Algeria 3804354 1941301 1863053 781737 422074 359663
## tot_pop_t0 tot_pop_t1 m_pop_t0 m_pop_t1 f_pop_t0 f_pop_t1
## 1 8822365 8357574 4349118 4093443 4473247 4264131
## 2 39817251 42605338 19967989 21351197 19849262 21254140
## 3 5587052 6066316 2750135 2987231 2836918 3079085
## 4 10186261 10197463 4776793 4772021 5409467 5425443
## 5 7374752 10018497 3520593 4824125 3854160 5194372
## 6 25375810 28470191 13078798 14674879 12297012 13795311
We now incorporate migration data:
By subtracting the number of migrants from the total population, we determine the number of natives in each country, which corresponds to the diagonal elements of the stock table. This calculation is performed for each sex and period.
# we compute the number of native
demo$tot_native_t0 <- demo$tot_pop_t0 - demo$tot_foreign_t0
demo$tot_native_t1 <- demo$tot_pop_t1 - demo$tot_foreign_t1
demo$m_native_t0 <- demo$m_pop_t0 - demo$m_foreign_t0
demo$m_native_t1 <- demo$m_pop_t1 - demo$m_foreign_t1
demo$f_native_t0 <- demo$f_pop_t0 - demo$f_foreign_t0
demo$f_native_t1 <- demo$f_pop_t1 - demo$f_foreign_t1
# we compute the Net : (pop(t1) - pop(t0)) - (birth - death)
demo$tot_net <- (demo$tot_pop_t1 - demo$tot_pop_t0) - (demo$tot_birth - demo$tot_death)
demo$m_net <- (demo$m_pop_t1 - demo$m_pop_t0) - (demo$m_birth - demo$m_death)
demo$f_net <- (demo$f_pop_t1 - demo$f_pop_t0) - (demo$f_birth - demo$f_death)2 Some statistics on the orginal stock table
Most of the results presented in this section were not included in the article; they were used primarily to gain a better understanding of the migrant stocks data.
2.1 Available data
Let’s run some statistics to identify which countries host the most different countries of birth.
count_pairs <- data.frame(country = country$name,
as_dest = integer(length(country$iso)),
as_origin = integer(length(country$iso)),
resident = numeric(length(country$iso)),
abroad = numeric(length(country$iso)))
for(k in 1:length(country$iso)) {
count_pairs$as_dest[k] <- nrow(un_code[un_code$dest_code == country$iso[k], ])
count_pairs$as_origin[k] <- nrow(un_code[un_code$birth_code == country$iso[k], ])
count_pairs$resident[k] <- sum(un_code[un_code$dest_code == country$iso[k], "f_2024"])
count_pairs$abroad[k] <- sum(un_code[un_code$birth_code == country$iso[k], "f_2024"])
}For a given destination country, the average number of countries of birth is around 41. We present the number of countries of birth for each destination country. It appears that Norway, Denmark, Greece, and Australia have a high diversity of birth countries. In contrast, Monaco and Saint-Helena have very few.
dotchart(count_pairs$as_dest[order(count_pairs$as_dest)],
labels = count_pairs$country[order(count_pairs$as_dest)])We now focus on the most represented countries of birth across destination countries worldwide. The most frequently represented countries are the United States and China, while the least represented are Mayotte or Saint-Pierre and Miquelon.
dotchart(count_pairs$as_origin[order(count_pairs$as_origin)],
labels = count_pairs$country[order(count_pairs$as_origin)])2.2 Cumulative stocks
We now visualize the countries with the largest numbers of emigrants and immigrants. In 2024, India and China stand out as the top two countries with the highest numbers of migrants living abroad. Among the top 20 countries of origin, many are affected by ongoing conflicts (Ukraine, Russia, Palestine), recent conflicts (Syria, Afghanistan), or economic crises (Venezuela). The United States emerges as the leading destination for migrants, followed by Germany and Canada.
count_pairs$iso2 <- countrycode(count_pairs$country, "country.name", "iso2c")
count_pairs$continent <- countrycode(count_pairs$iso2, "iso2c", "continent")
count_pairs_pivot <- count_pairs %>%
mutate(iso2 = tolower(iso2),
total = resident + abroad) %>%
mutate(country = fct_reorder(substr(country, 1, 12), total)) %>%
arrange(-abroad) %>%
pivot_longer(cols = 4:5, names_to = "Migrants", values_to = "Values")p1 <- count_pairs %>%
mutate(iso2 = tolower(iso2)) %>%
mutate(country = fct_reorder(substr(country, 1, 12), abroad)) %>%
arrange(-abroad) %>%
head(n = 20) %>%
ggplot(aes(abroad, country, fill = continent)) +
geom_col() +
geom_flag(x = 0, aes(country = iso2), size = 8) +
labs(y = "Country", x = "Emmigrants",
title = "Top 20 origins of international migrants in 2024",
caption = "Data source: United Nations 2024") +
scale_x_continuous(labels = comma) +
theme_minimal()
p2 <- count_pairs %>%
mutate(iso2 = tolower(iso2)) %>%
mutate(country = fct_reorder(substr(country, 1, 12), resident)) %>%
arrange(-resident) %>%
head(n = 20) %>%
ggplot(aes(resident, country, fill = continent)) +
geom_col() +
geom_flag(x = 0, aes(country = iso2), size = 10) +
labs(y = "Country", x = "Immigrants",
title = "Top 20 destinations of international migrants in 2024",
caption = "Data source: United Nations 2024") +
scale_x_continuous(labels = comma) +
theme_minimal()
gridExtra::grid.arrange(p1, p2, ncol = 2)We present the same graphics, but with data aggregated by geographic regions.
count_pairs$region23 <- countrycode(count_pairs$iso2, "iso2c", "region23")
temp_mig <- count_pairs %>%
group_by(region23) %>%
summarise(abroad = sum(abroad),
resident = sum(resident))
p1 <- temp_mig %>%
mutate(region23 = fct_reorder(substr(region23, 1, 12), abroad)) %>%
arrange(-abroad) %>%
ggplot(aes(abroad, region23)) +
geom_col() +
labs(y = "Country", x = "Emmigrants",
title = "Origins of migrants in 2024",
caption = "Data source: United Nations 2024") +
scale_x_continuous(labels = comma) +
theme_minimal()
p2 <- temp_mig %>%
mutate(region23 = fct_reorder(substr(region23, 1, 12), resident)) %>%
arrange(-resident) %>%
ggplot(aes(resident, region23)) +
geom_col() +
labs(y = "Country", x = "Immigrants",
title = "Destinations of migrants in 2024",
caption = "Data source: United Nations 2024") +
scale_x_continuous(labels = comma) +
theme_minimal()
gridExtra::grid.arrange(p1, p2, ncol = 2)We now compare the origin of emigrants and destination of immigrants across continents for countries with the highest total migration stocks (emigrants + immigrants).
Asia: The net migration balance (emigrants - immigrants) is positive for the countries represented, meaning that more people are emigrating from these countries than immigrating to them.
Europe: The net migration balance is positive for countries affected by war (Russia, Ukraine) and for countries like Poland, Romania, and Portugal, which have more citizens living abroad than foreign migrants residing within their borders. In contrast, countries such as Germany, the United Kingdom, France, and Italy have a negative balance, as they attract more migrants than they send abroad.
Americas: The United States has the highest net migration difference, with a significant number of emigrants. However, for most other countries in the region, the opposite trend is observed, with more people leaving than arriving.
Africa: The migration pattern is similar to that of Asia, with all represented countries having more emigrants than immigrants. However, it is notable that many African countries still attract a considerable number of migrants.
Oceania: In contrast, Australia and New Zealand experience a negative migration balance, meaning they have more immigrants settling in their territories than emigrants leaving.
my_continent <- c("Asia", "Europe", "Americas", "Africa", "Oceania")
pop_range_breaks <- list(
c(-6000000, -4000000, -2000000, 0, 2000000, 4000000, 6000000),
c(-6000000, -4000000, -2000000, 0, 2000000, 4000000, 6000000),
c(-5000000, 0, 15000000, 30000000),
c(-2000000, -1000000, 0, 1000000, 2000000),
c(-500000, 0, 2500000, 5000000))
res_p <- vector("list", 5)
for(k in 1:5){
res_p[[k]] <- count_pairs_pivot %>%
filter(continent == my_continent[k]) %>%
mutate(Values = ifelse(Migrants == "abroad", -Values, Values)) %>%
head(n = 20) %>%
ggplot(aes(Values, country, fill = Migrants)) +
geom_col() +
geom_flag(x = 0, aes(country = iso2), size = 4) +
scale_x_continuous(breaks = pop_range_breaks[[k]],
labels = scales::comma(abs(pop_range_breaks[[k]]))) +
scale_fill_brewer(palette = "Dark2") +
theme_minimal() +
theme(legend.position = "top")
}
gridExtra::grid.arrange(res_p[[1]], res_p[[2]], res_p[[3]],
res_p[[4]], res_p[[5]], ncol = 5)For a given country of birth, we can also visualize the distribution of migrants across their countries of residence. It is notable that migrants from a particular country tend to settle in destinations within the same continent. For example:
Most Venezuelan migrants reside in Colombia, Peru, the United States, Brazil, and Ecuador. Spain also ranks among the top destinations, likely due to shared linguistic and cultural ties.
A significant number of migrants from India, Pakistan, and Bangladesh live in the Arabian Persian Gulf region, likely driven by economic opportunities.
Migrants from Syria, Afghanistan, and Myanmar primarily settle in neighboring countries: Turkey and Jordan for Syrians, Iran and Pakistan for Afghans, and Thailand and Bangladesh for Myanmar nationals.
Migrants from Morocco tend to follow a similar pattern to those from Romania, primarily settling in neighboring or nearby countries. Moroccan migrants predominantly reside in Spain and France, while Romanian migrants are mainly found in Germany and Italy.
p <- vector("list", 20)
my_col <- RColorBrewer::brewer.pal(5, "Set1")
dest_name <- countrycode(count_pairs$country[order(-count_pairs$abroad)[1:20]],
"country.name", "iso3c")
code_colors <- c("Americas" = my_col[1], "Asia" = my_col[2],
"Europe" = my_col[3], "Oceania" = my_col[4],
"Africa" = my_col[5])
for(k in 1:20)
p[[k]] <- un_code %>%
filter(birth_code == dest_name[k]) %>%
mutate(iso2 = tolower(countrycode(dest, "country.name", "iso2c")),
continent = factor(countrycode(iso2, "iso2c", "continent"))) %>%
mutate(country = fct_reorder(substr(dest, 1, 12), tot_2024)) %>%
arrange(-tot_2024) %>%
head(n = 15) %>%
ggplot(aes(tot_2024, country, fill = factor(continent))) +
geom_col() +
geom_flag(x = 0, aes(country = iso2), size = 5) +
labs(y = "Country", x = "Immigrants",
title = paste0("Migrants from ", dest_name[k]),
caption = "Data source: United Nations 2024") +
scale_x_continuous(labels = comma) +
scale_fill_manual(values = code_colors) +
theme_minimal()
gridExtra::grid.arrange(p[[1]], p[[2]], p[[3]], p[[4]], p[[5]], p[[6]],
p[[7]], p[[8]], p[[9]], p[[10]], p[[11]], p[[12]], p[[13]], p[[14]],
p[[15]], p[[16]], p[[17]], p[[18]], p[[19]], p[[20]], ncol = 5)From the previous bar charts, it is evident that migrant distribution for a given country is not uniform across destination countries. To confirm this, we plot the Lorenz curve \((M_i/M_k,P_i/P_k)\), where for a given country \(k\), \(M_i\) represents the number of migrants from country \(k\) residing in country \(i\), and \(P_i\) is the population of country \(i\). Here, \(M_k\) denotes the total number of migrants abroad from country \(k\), while \(P_k\) represents the total population of all destination countries hosting at least one migrant from country \(k\). The observations are ordered according to the ratio \(M_i/P_i\).
Our dot chart confirms that, for a given country, a significant proportion of migrants reside in just a few destination countries.
2.3 Statistics on the neighbours
We first calculate the shortest possible distance between the borders of the origin and destination countries. For instance, if a person migrates from Spain to France, the measured distance would be the shortest distance between the closest points on their borders. Since Spain and France share a common frontier, this distance would be zero.
## Spherical geometry (s2) switched off
world_sf <- GeoDATA %>%
rename(region = SIREGION, iso3 = ISO3)
countries_regions <- world_sf %>%
rmapshaper::ms_simplify(, keep = 0.05, keep_shapes = T)
# polygons with same country than UN data basis
my_proj <- '+proj=robin +lon_0=0 +x_0=0 +y_0=0 +ellps=WGS84 +datum=WGS84 +units=m +no_defs'
world <- st_transform(countries_regions, my_proj)
world_union <- st_union(world)
# contours, longitude and latitude of the map
p1 <- rbind(c(-180, -90),
c(180, -90),
cbind(180, seq(-90, 90, by = 1)),
c(180, 90),
c(-180, 90),
cbind(-180, seq(90, -90, by = -1)),
c(-180, -90))
border_sf <- st_sfc(st_polygon(list(p1)))
long_sf <- st_sfc(st_linestring(cbind(-150, seq(-90, 90, by = 1))),
st_linestring(cbind(-120, seq(-90, 90, by = 1))),
st_linestring(cbind(-90, seq(-90, 90, by = 1))),
st_linestring(cbind(-60, seq(-90, 90, by = 1))),
st_linestring(cbind(-30, seq(-90, 90, by = 1))),
st_linestring(cbind(0, seq(-90, 90, by = 1))),
st_linestring(cbind(150, seq(-90, 90, by = 1))),
st_linestring(cbind(120, seq(-90, 90, by = 1))),
st_linestring(cbind(90, seq(-90, 90, by = 1))),
st_linestring(cbind(60, seq(-90, 90, by = 1))),
st_linestring(cbind(30, seq(-90, 90, by = 1)))
)
lat_sf <- st_sfc(st_linestring(cbind(seq(-180, 180, by = 1), -60)),
st_linestring(cbind(seq(-180, 180, by = 1), -30)),
st_linestring(cbind(seq(-180, 180, by = 1), -0)),
st_linestring(cbind(seq(-180, 180, by = 1), 30)),
st_linestring(cbind(seq(-180, 180, by = 1), 60))
)
# Coordinate Reference System
st_crs(border_sf) <- 4326
st_crs(long_sf) <- 4326
st_crs(lat_sf) <- 4326
border_sf <- st_transform(border_sf, my_proj)
long_sf <- st_transform(long_sf, my_proj)
lat_sf <- st_transform(lat_sf, my_proj)
sea <- st_difference(border_sf, world_union)# merge with Un data
un_code <- merge(un_code, my_dist, by.x = c("dest_code", "birth_code"),
by.y = c("from", "to"))The chart below illustrates these distances for all international migrant populations worldwide, while also highlighting the total number of people living outside their home country in 2024.
Example of interpretation:
\(36\%\) of migrants worldwide reside in a neighboring country, meaning a country with which their country of origin shares a common border.
\(50\%\) of migrants live in a country located less than 750 km from their country of origin.
temp <- un_code
temp <- temp[order(temp$my_dist), ]
temp <- aggregate(temp[, "tot_2024"], by = list(my_dist = temp$my_dist),
FUN = sum)
temp$x <- cumsum(temp$x)/sum(temp$x)
par(las = 1)
plot(temp$my_dist / 1000, temp$x, type = "l",
xlab = "distance from country birth (km)",
ylab = "share of migrants", ylim = c(0, 1))
abline(v = 0, lty = 2, col = "grey")For each country, we map the preferred destinations where at least \(80%\) of its emigrants reside. This visualization confirms that migrants from a given country generally settle in nearby or regional destinations. However, the United States stands out as a notable exception, consistently appearing as a key destination for migrants from diverse origins worldwide.
countries_regions <- st_transform(countries_regions, my_proj)
p <- vector("list", 20)
my_col <- RColorBrewer::brewer.pal(5, "Set1")
dest_name <- countrycode(count_pairs$country[order(-count_pairs$abroad)[1:20]],
"country.name", "iso3c")
code_colors <- c("AMERICAS" = my_col[1], "ASIA" = my_col[2],
"EUROPE" = my_col[3], "OCEANIA" = my_col[4],
"AFRICA" = my_col[5])
pdf("test.pdf", width = 20, height = 8.5)
par(mfrow = c(4, 5), oma = c(0, 0, 0, 0), mar = c(0, 0, 0.5, 0))
for(k in 1:20) {
temp_pays <- un_code %>%
filter(birth_code == dest_name[k]) %>%
mutate(iso2 = tolower(countrycode(dest, "country.name", "iso2c")),
continent = factor(countrycode(iso2, "iso2c", "continent"))) %>%
mutate(country = fct_reorder(substr(dest, 1, 12), tot_2024)) %>%
arrange(tot_2024) %>%
mutate(pct_rate = 1 - cumsum(tot_2024 / sum(tot_2024))) %>%
filter(pct_rate < 0.80)
plot(sea, col = "lightblue", border = rgb(0.2, 0.2, 0.2))
title(dest_name[k], line = -0.5)
plot(st_geometry(world_union), add = T, border = rgb(0.2, 0.2, 0.2))
plot(st_geometry(countries_regions[countries_regions$iso3 == dest_name[k],]),
col = "yellow", border = code_colors[countries_regions[which(countries_regions$iso3 ==
dest_name[k]),][["CONTINENT"]]], lwd = 1.3, add = T)
plot(st_geometry(countries_regions[countries_regions$ISO2 %in%
toupper(temp_pays$iso2),]),
col = code_colors[countries_regions[countries_regions$ISO2 %in%
toupper(temp_pays$iso2),][["CONTINENT"]]], add = T)
plot(long_sf, add = T, col = "lightgrey")
plot(lat_sf, add = T, col = "lightgrey")
}
dev.off()## quartz_off_screen
## 2
3 Demographic Adjustements
Before the estimation process, we compute the matrices \(S(t_k)\) for the different years \(t_k = 1990, 1995, 2000, 2005, 2010, 2015, 2020, 2024\). We define three versions of the stock tables as presented in Subsection 2.3. of the article:
Raw data (directly from the IMS): The stock tables remain unchanged. For each period \(T = [t_0, t_1]\), the IMS data are converted into two square matrices, \(S(t_0)\) and \(S(t_1)\), representing the stock tables at times \(t_0\) and \(t_1\).
Open demographic adjustment following Abel (2013): The row \(k\) of the stock tables \(S(t_0)\) and \(S(t_1)\) is adjusted so that the row margins of both tables are equal and correspond to the minimum of the row margins of \(S(t_0)\) and \(S(t_1)\).
Closed demographic adjustment following Abel and Sander (2014): The row \(k\) of the stock tables \(S(t_0)\) and \(S(t_1)\) is adjusted so that the row margins of both tables are equal and correspond to the mid-range of the row margins of \(S(t_0)\) and \(S(t_1)\), with the constraint that the total resident population remains unchanged.
We make several assumptions:
If no information is available on migration between a country of birth \(i\) and a country of residence \(j\), we assume that there are no migrants from \(i\) in \(j\). This assumption impacts 82.4\(%\) of all possible country pairs, given the \(230 \times 230\) combinations. Additionally, some country pairs explicitly report zero values. The proportion of zeros in our stock tables varies between
(r round(0.56 + 100 * (1 - nrow(un_code) / 230^2), 1), 84.9)\(%\), depending on the year.Opened and closed demographic adjustments are performed in three stages:
- Stage 1 “Stayers \(S_i^i\)”: The diagonal elements, which represent the number of individuals born in country \(i\) and residing in \(i\) at time \(t\), are set equal to the total population at \(t\) minus the number of migrants living in \(i\) at \(t\). This adjustment is achieved by integrating data from the WPP 2024 and IMS databases.
- Stage 2 “Birth and Death Adjustment”: The number of deaths in each country over period \(T\) is allocated proportionally across all countries of birth, including migrants whose country of birth is classified as “Unknown.”
- Stage 3 “Row-Margin Adjustments (see above)”: This step ensures consistency in stock tables by applying methods primarily implemented in the migest package.
ptm <- proc.time()
for (k in 1:(length(years) - 1)) {
my_period <- paste0(years[k], "-", years[k + 1])
cat("Period: ", my_period, "\n")
for (l in c("tot", "m", "f")) {
cat(" Sex: ", l, "\n")
# create the stock tables as matrices
s1 <- pivot_wider(data = un_code[, c("dest_code", "birth_code",
paste0(l, "_", years[k]))], names_from = 1, values_from = 3)
s2 <- pivot_wider(data = un_code[, c("dest_code", "birth_code",
paste0(l, "_", years[k + 1]))], names_from = 1, values_from = 3)
# permute and give row and col names for s1
birth_code <- s1$birth_code
temp_re <- s1$birth_code[!s1$birth_code %in% colnames(s1)]
if(length(temp_re) >= 1) {
s1[, temp_re] <- 0
}
s1 <- as(s1[, country$iso], "matrix")
row.names(s1) <- birth_code
s1 <- s1[country$iso, ]
# permute and give row and col names for s2
birth_code <- s2$birth_code
temp_re <- s2$birth_code[!s2$birth_code %in% colnames(s2)]
if(length(temp_re) >= 1) {
s2[, temp_re] <- 0
}
s2 <- as(s2[, country$iso], "matrix")
row.names(s2) <- birth_code
s2 <- s2[country$iso, ]
# H1 - Replace NA by 0
s1[is.na(s1)] <- 0
s2[is.na(s2)] <- 0
# H2 - we add the diagonal elements
demo_k <- demo %>% filter(years[k] == as.integer(substr(demo$period, 1, 4)))
# birth and death vector
my_birth <- demo_k[match(country$code, demo_k$LocID), paste0(l, "_birth")]
names(my_birth) <- country$iso
my_death <- demo_k[match(country$code, demo_k$LocID), paste0(l, "_death")]
names(my_death) <- country$iso
# native population
diag(s1) <- demo_k[match(country$code, demo_k$LocID), paste0(l, "_native_t0")]
diag(s2) <- demo_k[match(country$code, demo_k$LocID), paste0(l, "_native_t1")]
# replace 0 and -1 at s0 by 1
diag(s1)[diag(s1) <= 1] <- 1
# replace 0 and -1 by the number of birth
diag(s2)[diag(s2) <= 1] <- my_birth[diag(s2) <= 1]
#########
# Demographic Adjustment : birth and death adjustments
# We add the row Others to s1
my_other <- un_others_count[match(country$code, un_others_count$dest_code),
paste0(l, "_", years[k])] %>% pull()
my_other[is.na(my_other)] <- 0
############## for Open method ###########################################
# a. birth/death adjustement
s2_open <- s2 - diag(my_birth)
# we take into account the Others (H3 - Death Adjustment)
s1_open <- rbind(s1, my_other)
for (i in 1:ncol(s1)) {
s1_open[, i] <- s1_open[, i] - jefferson(s1_open[, i], my_death[i])
}
# we drop the last row corresponding to Others
s1_open <- s1_open[-nrow(s1_open), ]
# b. row-margins adjustment
sum_1 <- rowSums(s1_open)
sum_2 <- rowSums(s2_open)
diff <- abs(sum_2 - sum_1)
for (i in 1:nrow(s1_open)) {
if (sum_1[i] < sum_2[i]) {
s2_open[i, ] <- s2_open[i, ] - jefferson(s2_open[i, ], diff[i])
} else {
s1_open[i, ] <- s1_open[i, ] - jefferson(s1_open[i, ], diff[i])
}
}
############## for Close method ###########################################
# a. Normally, the difference of total population between S1 and S0 should
# be explained by the natural increase. If it is not the case, we report this difference
# compute the number of deaths whic are different from "others"
my_death_2 <- my_death * colSums(s1) / colSums(rbind(s1, my_other))
# on the stayers
pop_grow <- sum(s2 - s1)
nat_grow <- sum(my_birth - my_death_2)
dd <- nat_grow - pop_grow
tot1 <- sum(diag(s1))
tot2 <- sum(diag(s2))
s1_closed <- s1
s2_closed <- s2
if (round(dd%%2) == 0 & dd%%2 != 0) {
diag(s2_closed) <- rescale_integer_sum(x = diag(s2_closed),
tot = tot2 + 1)
pop_grow <- sum(s2_closed - s1_closed)
dd <- nat_grow - pop_grow
tot2 <- sum(diag(s2_closed))
}
if (dd != 0) {
diag(s1_closed) <- migest::rescale_integer_sum(x = diag(s1_closed),
tot = tot1 - dd/2)
diag(s2_closed) <- migest::rescale_integer_sum(x = diag(s2_closed),
tot = tot2 + dd/2)
}
# b. birth/death adjustment
#d_mat <- migest:::death_mat(d_por = my_death, m1 = s1_closed, method = death_method,
# m2 = s2_closed, b_por = my_birth)
#s1_closed <- s1_closed - d_mat
# Check non negative values (code from function birth_mat())
xx <- diag(s2_closed) - my_birth < 0
if (sum(xx) > 0) {
temp <- mipfp::Ipfp(seed = s2_closed, target.list = list(2),
target.data = list(my_birth/2),
tol = 1, iter = 1e+05,
tol.margins = 1)$x.hat[, xx]
}
s2_closed <- s2_closed - diag(my_birth)
if (sum(xx) > 0) {
s2_closed[, xx] <- temp
}
# death adjustment by taking into account the Others (H3 - Death Adjustment)
# s1_closed <- rbind(s1_closed, my_other)
for (i in 1:ncol(s1)) {
# s1_closed[, i] <- s1_closed[, i] - jefferson(s1_closed[, i], my_death_2[i])
s1_closed[, i] <- s1_closed[, i] -
my_death_2[i] * s1_closed[, i] / sum(s1_closed[, i])
}
# we drop the last row corresponding to Others
# s1_closed <- s1_closed[-nrow(s1_closed), ]
#########
# c. To get the same number of observations in s1 and s2
# we adjust with the stayers
#dd <- sum(s2_closed) - sum(s1_closed)
#if (dd != 0) {
# diag(s1_closed) <- diag(s1_closed) + jefferson(diag(s1_closed), ceiling(dd * 0.5))
# diag(s2_closed) <- diag(s2_closed) - jefferson(diag(s2_closed), floor(dd * 0.5))
#}
# we adjust the row sums
dd <- rowSums(s1_closed) - rowSums(s2_closed)
s1_closed <- mipfp::Ipfp(seed = s1_closed, target.list = list(1, 2),
target.data = list(rowSums(s1_closed) - dd/2, colSums(s1_closed)),
tol = 0.00001, iter = 1e+05, tol.margins = 0.00001)
s2_closed <- mipfp::Ipfp(seed = s2_closed, target.list = list(1, 2),
target.data = list(rowSums(s2_closed) + dd/2, colSums(s2_closed)),
tol = 0.00001, iter = 1e+05, tol.margins = 0.00001)
s1_closed <- s1_closed$x.hat
s2_closed <- s2_closed$x.hat
################
save(s1, s2, s1_open, s2_open, s1_closed, s2_closed,
file = paste0("results/stock_", l, "_", years[k], ".RData"))
}
}4 Preparation of the Optimal Transport Method
In the first part of the Optimal Transpoot methodology (section 4 in the article), we leverage auxiliary information to deepen our understanding of migration dynamics by focusing on two key goals:
- Estimating Migration Flows: We aim to estimate the outflows and inflows for each country-of-birth and country-of-residence pair.
- Estimating Return Migration: We also seek to estimate the proportion of returnees among the outflows—migrants who leave a country of residence to return to their country of birth.
4.1 Eurostat
This dataset is used to estimate outflows and inflows relative to stock data (See subsection 4.2 in the article). The source is available here.
# outflows
test <- read.table("data/estat_migr_emi4ctb.tsv", sep = '\t', header = TRUE)
test <- test %>%
mutate_at(vars(`X2023`:`X2008`), str_extract, pattern = "[0-9]+") %>%
mutate_at(vars(`X2023`:`X2008`), as.numeric)
test <- test %>%
separate(`freq.age.agedef.c_birth.unit.sex.geo.TIME_PERIOD`,
c("freq", "age", "agedef", "c_birth", "unit", "sex", "geo", "TIME_PERIOD"),
sep = ",") %>%
filter(agedef == "COMPLET", age == "TOTAL", sex == "T") %>%
dplyr::select(4, 7, 9:24) %>%
rename(birth = 1, REF_AREA = 2) %>%
pivot_longer(cols = 3:18, names_to = "TIME_PERIOD", values_to = "outflows")
test$TIME_PERIOD <- as.numeric(substr(test$TIME_PERIOD, 2, 5))
test <- test[!is.na(test$outflows), ]
# convert iso2 to iso3
test$birth <- countrycode(test$birth, "iso2c", "iso3c")
test$REF_AREA <- countrycode(test$REF_AREA, "iso2c", "iso3c")
test <- test[!is.na(test$birth), ]
test <- test[!is.na(test$REF_AREA), ]
y_eurostat <- test[test$birth != test$REF_AREA, ]
# inflows
test <- read.table("data/estat_migr_imm3ctb.tsv", sep = '\t', header = TRUE)
test <- test %>%
mutate_at(vars(`X2023`:`X2008`), str_extract, pattern = "[0-9]+") %>%
mutate_at(vars(`X2023`:`X2008`), as.numeric)
test <- test %>%
separate(`freq.age.agedef.c_birth.unit.sex.geo.TIME_PERIOD`,
c("freq", "age", "agedef", "c_birth", "unit", "sex", "geo", "TIME_PERIOD"),
sep = ",") %>%
filter(agedef == "COMPLET", age == "TOTAL", sex == "T") %>%
dplyr::select(4, 7, 9:24) %>%
rename(birth = 1, REF_AREA = 2) %>%
pivot_longer(cols = 3:18, names_to = "TIME_PERIOD", values_to = "inflows")
test$TIME_PERIOD <- as.numeric(substr(test$TIME_PERIOD, 2, 5))
test <- test[!is.na(test$inflows), ]
# convert iso2 to iso3
test$birth <- countrycode(test$birth, "iso2c", "iso3c")
test$REF_AREA <- countrycode(test$REF_AREA, "iso2c", "iso3c")
test <- test[!is.na(test$birth), ]
test <- test[!is.na(test$REF_AREA), ]
test <- test[test$birth != test$REF_AREA, ]
y_eurostat <- merge(y_eurostat, test,
by = c("birth", "REF_AREA", "TIME_PERIOD"),
all = T)
# keep only the pairs of country available
codes_iso3 <- c("AFG", "ALB", "DZA", "ASM", "AND", "AGO", "AIA", "ATG", "ARG",
"ARM", "ABW", "AUS", "AUT", "AZE", "BHS", "BHR", "BGD", "BRB", "BLR", "BEL",
"BLZ", "BEN", "BMU", "BTN", "BOL", "BIH", "BWA", "BRA", "BRN", "BGR", "BFA",
"BDI", "CPV", "KHM", "CMR", "CAN", "CYM", "CAF", "TCD", "CHL", "CHN", "COL",
"COM", "COG", "COD", "COK", "CRI", "CIV", "HRV", "CUB", "CUW", "CYP", "CZE",
"DNK", "DJI", "DMA", "DOM", "ECU", "EGY", "SLV", "GNQ", "ERI", "EST", "SWZ",
"ETH", "FLK", "FRO", "FJI", "FIN", "FRA", "GUF", "PYF", "GAB", "GMB", "GEO",
"DEU", "GHA", "GIB", "GRC", "GRL", "GRD", "GLP", "GUM", "GTM", "GIN", "GNB",
"GUY", "HTI", "HND", "HKG", "HUN", "ISL", "IND", "IDN", "IRN", "IRQ", "IRL",
"IMN", "ISR", "ITA", "JAM", "JPN", "JOR", "KAZ", "KEN", "KIR", "PRK", "KOR",
"KWT", "KGZ", "LAO", "LVA", "LBN", "LSO", "LBR", "LBY", "LIE", "LTU", "LUX",
"MAC", "MDG", "MWI", "MYS", "MDV", "MLI", "MLT", "MHL", "MTQ", "MRT", "MUS",
"MYT", "MEX", "FSM", "MDA", "MCO", "MNG", "MNE", "MSR", "MAR", "MOZ", "MMR",
"NAM", "NRU", "NPL", "NLD", "NCL", "NZL", "NIC", "NER", "NGA", "NIU", "MKD",
"MNP", "NOR", "OMN", "PAK", "PLW", "PSE", "PAN", "PNG", "PRY", "PER", "PHL",
"POL", "PRT", "PRI", "QAT", "REU", "ROU", "RUS", "RWA", "SHN", "KNA", "LCA",
"SPM", "VCT", "WSM", "SMR", "STP", "SAU", "SEN", "SRB", "SYC", "SLE", "SGP",
"SXM", "SVK", "SVN", "SLB", "SOM", "ZAF", "SSD", "ESP", "LKA", "SDN", "SUR",
"SWE", "CHE", "SYR", "TWN", "TJK", "TZA", "THA", "TLS", "TGO", "TKL", "TON",
"TTO", "TUN", "TUR", "TKM", "TCA", "TUV", "UGA", "UKR", "ARE", "GBR", "USA",
"URY", "UZB", "VUT", "VEN", "VNM", "VGB", "VIR", "WLF", "ESH", "YEM", "ZMB", "ZWE")
n_country <- length(codes_iso3)
# we keep only the outflows/inflows of pairs we do have in the UN stocks
stock_time <- data.frame(
birth = rep(codes_iso3, each = n_country),
residence = rep(codes_iso3, times = n_country))
y_eurostat <- merge(y_eurostat, stock_time[, c("residence", "birth")],
by.x = c("birth", "REF_AREA"),
by.y = c("birth", "residence"))Few Statistics on Inflows/outflows:
par(mfcol = c(2, 1), oma = c(0, 0, 0, 0), mar = c(3, 6, 0.5, 0.5), las = 1,
mgp = c(4, 1, 0))
y_eurostat %>%
group_by(TIME_PERIOD) %>%
summarise(inflows = sum(inflows, na.rm = T)) %>%
plot(type = "l", ylim = c(0, 4500000))
y_eurostat %>%
group_by(TIME_PERIOD) %>%
summarise(outflows = sum(outflows, na.rm = T)) %>%
plot(type = "l", ylim = c(0, 4500000))Another example between two countries :
par(mfcol = c(2, 1), oma = c(0, 0, 0, 0), mar = c(3, 6, 0.5, 0.5),
las = 1, mgp = c(4, 1, 0))
y_eurostat %>%
filter(birth == "MAR", REF_AREA == "ESP") %>%
pivot_longer(cols = 4:5, names_to = "type", values_to = "Flows") %>%
ggplot(aes(x = TIME_PERIOD, y = Flows, col = type)) +
geom_line() +
xlab("Years") +
ylab("Migration") +
ggtitle("Movements to and from Spain of Moroccan-Born Individuals") 4.2 Stocks
Based on the quinquennial stock tables, we interpolate annual stock data (see Subsection 4.1 of the article) and merge it with the outflows/inflows dataset from Eurostat.
stock_long <- data.frame(
birth = character(),
REF_AREA = character(),
year = numeric(),
stock_0_open = numeric(),
stock_1_open = numeric(),
stock_0_closed = numeric(),
stock_1_closed = numeric(),
inflows = numeric(),
outflows = numeric()
)
##########. STOCK.
t_time <- as.character(seq(1990, 2020, 5))
for(k in t_time) {
load(paste0("results/stock_tot_", k, ".RData"))
n_country <- nrow(s1)
if(k != "2020") {
temp <- data.frame(
birth = rep(colnames(s1), each = n_country),
residence = rep(colnames(s1), times = n_country),
t0 = as.vector(t(s1)),
t5 = as.vector(t(s2)),
t0_open = as.vector(t(s1_open)),
t5_open = as.vector(t(s2_open)),
t0_closed = as.vector(t(s1_closed)),
t5_closed = as.vector(t(s2_closed))) %>%
mutate(t1 = t0 + 1 * (t5 - t0) / 5,
t2 = t0 + 2 * (t5 - t0) / 5,
t3 = t0 + 3 * (t5 - t0) / 5,
t4 = t0 + 4 * (t5 - t0) / 5,
t1_open = t0_open + 1 * (t5_open - t0_open) / 5,
t2_open = t0_open + 2 * (t5_open - t0_open) / 5,
t3_open = t0_open + 3 * (t5_open - t0_open) / 5,
t4_open = t0_open + 4 * (t5_open - t0_open) / 5,
t1_closed = t0_closed + 1 * (t5_closed - t0_closed) / 5,
t2_closed = t0_closed + 2 * (t5_closed - t0_closed) / 5,
t3_closed = t0_closed + 3 * (t5_closed - t0_closed) / 5,
t4_closed = t0_closed + 4 * (t5_closed - t0_closed) / 5
)
# change name
names(temp)[3] <- paste0("s_", as.numeric(k))
names(temp)[4] <- paste0("s_", as.numeric(k) + 5)
names(temp)[5] <- paste0("s_open_", as.numeric(k))
names(temp)[6] <- paste0("s_open_", as.numeric(k) + 5)
names(temp)[7] <- paste0("s_closed_", as.numeric(k))
names(temp)[8] <- paste0("s_closed_", as.numeric(k) + 5)
names(temp)[9:12] <- paste0("s_", as.numeric(k) + c(1, 2, 3, 4))
names(temp)[13:16] <- paste0("s_open_", as.numeric(k) + c(1, 2, 3, 4))
names(temp)[17:20] <- paste0("s_closed_", as.numeric(k) + c(1, 2, 3, 4))
} else {
temp <- data.frame(
birth = rep(colnames(s1), each = n_country),
residence = rep(colnames(s1), times = n_country),
t0 = as.vector(t(s1)),
t4 = as.vector(t(s2)),
t0_open = as.vector(t(s1_open)),
t4_open = as.vector(t(s2_open)),
t0_closed = as.vector(t(s1_closed)),
t4_closed = as.vector(t(s2_closed))) %>%
filter(birth != residence) %>%
mutate(t1 = t0 + 1 * (t4 - t0) / 4,
t2 = t0 + 2 * (t4 - t0) / 4,
t3 = t0 + 3 * (t4 - t0) / 4,
t1_open = t0_open + 1 * (t4_open - t0_open) / 4,
t2_open = t0_open + 2 * (t4_open - t0_open) / 4,
t3_open = t0_open + 3 * (t4_open - t0_open) / 4,
t1_closed = t0_closed + 1 * (t4_closed - t0_closed) / 4,
t2_closed = t0_closed + 2 * (t4_closed - t0_closed) / 4,
t3_closed = t0_closed + 3 * (t4_closed - t0_closed) / 4)
names(temp)[3] <- paste0("s_", as.numeric(k))
names(temp)[4] <- paste0("s_", as.numeric(k)+4)
names(temp)[5] <- paste0("s_open_", as.numeric(k))
names(temp)[6] <- paste0("s_open_", as.numeric(k)+4)
names(temp)[7] <- paste0("s_closed_", as.numeric(k))
names(temp)[8] <- paste0("s_closed_", as.numeric(k)+4)
names(temp)[9:11] <- paste0("s_", as.numeric(k) + c(1, 2, 3))
names(temp)[12:14] <- paste0("s_open_", as.numeric(k) + c(1, 2, 3))
names(temp)[15:17] <- paste0("s_closed_", as.numeric(k) + c(1, 2, 3))
}
######. merge with outflows / inflows
for(my_year in (as.numeric(k)):(as.numeric(k)+ 4)) {
if(my_year == 2024)
break
est_year <- y_eurostat %>%
filter(TIME_PERIOD == my_year) %>%
merge(temp[, c("residence", "birth",
names(temp)[substr(names(temp), nchar(names(temp)) - 3, nchar(names(temp))) %in% c(my_year, my_year + 1)])],
by.x = c("birth", "REF_AREA"),
by.y = c("birth", "residence"), all = T)
# filter 0 and 1values
# initialization
stock_0_open <- est_year[[paste0("s_open_", my_year)]]
stock_1_open <- est_year[[paste0("s_open_", my_year + 1)]]
delta_1_open <- (stock_1_open - stock_0_open)
stock_0_closed <- est_year[[paste0("s_closed_", my_year)]]
stock_1_closed <- est_year[[paste0("s_closed_", my_year + 1)]]
delta_1_closed <- (stock_1_closed - stock_0_closed)
in_flows <- est_year$inflows
out_flows <- est_year$outflows
stock_long <- rbind(stock_long, data.frame(
birth = est_year$birth,
REF_AREA = est_year$REF_AREA,
year = my_year,
stock_0_open = stock_0_open,
stock_1_open = stock_1_open,
diff_open = delta_1_open,
stock_0_closed = stock_0_closed,
stock_1_closed = stock_1_closed,
diff_closed = delta_1_closed,
outflows = out_flows,
inflows = in_flows
))
}
}4.3 Prediction of outflows/inflows using a NLS model
For both open and closed adjusted stocks, we fit a Nonlinear Least Squares (NLS) model to explain outflows (respectively, inflows) as a function of the stock at time \(t\) (respectively, \(t+1\)). The following code produces the estimates presented in Table 3 of the article (corresponding to the models specified in Equations 19 and 20), as well as the results shown in Figure 6, which compares the performance of the Nonlinear Least Squares (NLS) and the Ordinary Linear Model (OLM) regressions.
# filter the data
all_data_outflows <- stock_long %>% filter(
birth != REF_AREA,
!is.na(stock_long$outflows),
!(birth == "RUS" & REF_AREA == "UKR"))
exlude_out <- stock_long %>% filter(
birth != REF_AREA,
!is.na(stock_long$outflows),
(birth == "RUS" & REF_AREA == "UKR"))
all_data_inflows <- stock_long %>% filter(
birth != REF_AREA,
!is.na(stock_long$inflows),
inflows < 200000)
exlude_in <- stock_long %>% filter(
birth != REF_AREA,
!is.na(stock_long$inflows),
inflows > 200000)
###### results
res_paper <- data.frame(type = character(),
type_flows = character(),
alpha = numeric(),
beta = numeric(),
RMSE_NLS = numeric(),
RMSE_OLM = numeric(),
size = numeric())
# NLS
nls_outflows <- vector("list", 2)
nls_inflows <- vector("list", 2)
names(nls_outflows) = names(nls_inflows) = c("open", "closed")
pdf("figures/lm_eurostat.pdf", width = 8, height = 6)
par(mfcol = c(2, 2), oma = c(0, 0, 0, 0), mar = c(3.2, 3.9, 1, 0.4),
mgp = c(2.9, 0.6, 0), las = 1)
ind_stock <- c(4, 5, 6)
for(type in c("open", "closed")) {
temp_all_data_outflows <- all_data_outflows[, c(1, 2, 3, ind_stock, 10, 11)]
names(temp_all_data_outflows)[c(4, 5, 6)] <- c("stock_0", "stock_1", "diff")
temp_all_data_inflows <- all_data_inflows[, c(1, 2, 3, ind_stock, 10, 11)]
names(temp_all_data_inflows)[c(4, 5, 6)] <- c("stock_0", "stock_1", "diff")
temp_all_data_outflows <- temp_all_data_outflows %>%
filter(outflows > 0, stock_0 >= outflows) %>%
mutate(rate_outflows = outflows / stock_0)
temp_all_data_inflows <- temp_all_data_inflows %>%
filter(inflows > 0, stock_1 >= inflows) %>%
mutate(rate_inflows = inflows / stock_1)
# computation of the regression coefficients for outflows
# Plot
plot(outflows ~ stock_0, data = temp_all_data_outflows, cex = 0.3, pch = 16,
xlim = c(0, 3200000), ylim = c(0, 75000), xlab = "Stock at time t \n",
ylab = "outflows ")
if (type == "open") {
points(exlude_out$stock_0_open, exlude_out$outflows, pch = 4,
col = "red")
title("Open Adjustement")
} else {
points(exlude_out$stock_0_closed, exlude_out$outflows, pch = 4,
col = "red")
title("Closed Adjustement")
}
x_pred <- data.frame(stock_0 = seq(0, 3200000, length.out = 1000))
temp_123 <- lm(outflows ~ stock_0, data = temp_all_data_outflows)
abline(temp_123, lty = 2, col = "cyan", lwd = 2)
my_pred <- predict(temp_123)
rmse_olm <- sqrt(mean(((my_pred - temp_all_data_outflows$outflows)) ^ 2))
nls_outflows[[type]] <- nls(outflows ~ a * stock_0 / (b + stock_0),
data = temp_all_data_outflows, start = list(a = 1, b = 1))
pred_nls <- predict(nls_outflows[[type]], newdata = x_pred)
lines(x_pred$stock_0, pred_nls, lty = 2, col = "magenta", lwd = 2)
my_pred <- predict(nls_outflows[[type]])
rmse_nls <- sqrt(mean(((my_pred - temp_all_data_outflows$outflows)) ^ 2))
res_paper <- rbind(res_paper,
data.frame(
type = type,
type_flows = "outflows",
alpha = coefficients(nls_outflows[[type]])[1],
beta = coefficients(nls_outflows[[type]])[2],
RMSE_NLS = rmse_nls,
RMSE_OLM = rmse_olm,
size = nrow(temp_all_data_outflows)))
########################################################
# inflows
# Plot
plot(inflows ~ stock_1, data = temp_all_data_inflows, cex = 0.3, pch = 16,
xlim = c(0, 3200000), ylim = c(0, 763000), xlab = "Stock at time t + 1 \n",
ylab = "inflows ")
if (type == "open") {
points(exlude_in$stock_1_open, exlude_in$inflows, pch = 4, col = "red")
} else {
legend("topright", legend = c("LM", "NLS", "influential pts"),
lty = c(2, 2, NA), col = c("cyan", "magenta", "red"),
pch = c(NA, NA, 4), cex = c(0.8, 0.8, 0.8))
points(exlude_in$stock_1_closed, exlude_in$inflows, pch = 4, col = "red")
}
x_pred <- data.frame(stock_1 = seq(0, 3200000, length.out = 1000))
temp_123 <- lm(inflows ~ stock_1 - 1, data = temp_all_data_inflows)
abline(temp_123, lty = 2, col = "cyan", lwd = 2)
my_pred <- predict(temp_123)
rmse_olm <- sqrt(mean(((my_pred - temp_all_data_inflows$inflows)) ^ 2))
x_pred <- data.frame(stock_1 = seq(0, 10000000, length.out = 1000))
nls_inflows[[type]] <- nls(inflows ~ a * stock_1 / (b + stock_1),
data = temp_all_data_inflows, start = list(a = 0.12345, b = 0.54321))
pred_nls <- predict(nls_inflows[[type]], newdata = x_pred)
lines(x_pred$stock_1, pred_nls, lty = 2, col = "magenta", lwd = 2)
my_pred <- predict(nls_inflows[[type]])
rmse_nls <- sqrt(mean(((my_pred - temp_all_data_inflows$inflows)) ^ 2))
res_paper <- rbind(res_paper,
data.frame(
type = type,
type_flows = "inflows",
alpha = coefficients(nls_inflows[[type]])[1],
beta = coefficients(nls_inflows[[type]])[2],
RMSE_NLS = rmse_nls,
RMSE_OLM = rmse_olm,
size = nrow(temp_all_data_inflows)))
ind_stock <- ind_stock + 3
}
dev.off()## quartz_off_screen
## 2
We generate outflow and inflow predictions for the entire sample—covering all country-of-residence / country-of-birth pairs—from 1990 to 2024, based on the adjusted Open and Closed stock tables.
for(type in c("open", "closed")) {
for(type_flows in c("outflows", "inflows")) {
## NLS
if(type_flows == "outflows") {
stock_long[, paste0("hat_", type_flows, "_", type)] <-
predict(nls_outflows[[type]], newdata =
data.frame(stock_0 = stock_long[,
paste0("stock_", ifelse(type_flows == "outflows", 0, 1), "_", type)]))
} else {
stock_long[, paste0("hat_", type_flows, "_", type)] <-
predict(nls_inflows[[type]], newdata =
data.frame(stock_1 = stock_long[,
paste0("stock_", ifelse(type_flows == "outflows", 0, 1), "_", type)]))
}
}
}
pred_out_in_flows <- stock_long %>%
pivot_longer(cols = 12:15, names_to = "method", values_to = "hat_values") %>%
mutate(method = substr(method, 5, nchar(method)))
pred_out_in_flows$type_flows <- ifelse(substr(pred_out_in_flows$method, 1, 2) == "ou",
"outflows", "inflows")
pred_out_in_flows$type <- ifelse(
substr(pred_out_in_flows$method,
nchar(pred_out_in_flows$method),
nchar(pred_out_in_flows$method)) == "n", "open", "closed")
pred_out_in_flows$values <- ifelse(pred_out_in_flows$type_flows == "outflows", pred_out_in_flows$outflows, pred_out_in_flows$inflows)pred_out_in_flows %>%
filter(!is.na(values)) %>%
ggplot(aes(x = values, y = hat_values)) +
geom_point() +
xlim(0, 100000) +
ylim(0, 100000) +
geom_smooth(method = 'lm') +
facet_grid(type ~ type_flows)4.4 Auxiliary data from IMEM
We use the IMEM database to calibrate the estimated percentage of return migration (Subsection 4.3 and Appendix B2 in the article).
4.4.1 Import the IMEM data
The database is available here
# correspondance between iso
# url <- "http://en.wikipedia.org/wiki/ISO_3166-1"
# tables_url_wiki <- url %>%
# read_html() %>%
# html_nodes("table") %>%
# html_table(fill = T)
# table_iso <- tables_url_wiki[[3]]
# save(table_iso, file = "table_iso.RData")
load("results/table_iso.RData")
# year 2002 - 2008
imem <- readxl::read_xlsx("data/IMEM_final_results2.xlsx")
temp_imem <- as.matrix(imem[which(imem[, 1] == "median"),
-c(1, 2, (ncol(imem)-20):ncol(imem))])
imem_median <- as.numeric(as.vector(t(temp_imem[-c(nrow(temp_imem)-2, nrow(temp_imem)-1,
nrow(temp_imem)), ])))
nom_country_iso2 <- c("AT", "BE", "BG", "CH", "CY", "CZ", "DE", "DK", "EE", "ES", "FI",
"FR", "GR", "HU" , "IE", "IS", "IT", "LI", "LT", "LU", "LV", "MT",
"NL", "NO", "PL", "PT", "RO" , "SE", "SI", "SK", "GB")
nom_country_imem <- character(length(nom_country_iso2))
temp <- as.data.frame(table_iso)
for(k in 1:length(nom_country_iso2)) {
nom_country_imem[k] <- temp[which(temp[, 2] == nom_country_iso2[k]), 3]
}
y_imem = data.frame(
origin = rep(nom_country_imem, each = length(nom_country_imem) * length(2002:2008)),
years = rep(2002:2008, times = length(nom_country_imem) ^ 2),
dest = rep(rep(nom_country_imem, each = length(2002:2008)), times = length(nom_country_imem)),
flow = imem_median)
# 2nd part of the data
imem_2 <- read.csv(file = "data/flows_ODT_long.csv")
nom_full <- unique(imem_2$origin)
nom_country_imem <- character(length(nom_full))
temp <- as.data.frame(table_iso)
for(k in 1:length(nom_full)) {
candidate <- temp[which(temp[, 1] == nom_full[k]), 3]
nom_country_imem[k] <- ifelse(length(candidate) == 1,
temp[which(temp[, 1] == nom_full[k]), 3],
"")
}
nom_country_imem[5] <- "CYP"
nom_country_imem[22] <- "NLD"
nom_country_imem[32] <- "GBR"
for(k in 1:length(nom_full)) {
imem_2[which(imem_2$origin == nom_full[k]), "origin"] <- nom_country_imem[k]
imem_2[which(imem_2$dest == nom_full[k]), "dest"] <- nom_country_imem[k]
}
imem_2 <- imem_2[imem_2$origin != "", ]
imem_2 <- imem_2[imem_2$dest != "", ]
imem_2 <- imem_2[!is.na(imem_2$flow_50.), ]
y_imem = rbind(y_imem,
data.frame(origin = imem_2$origin,
years = imem_2$year,
dest = imem_2$dest,
flow = imem_2$flow_50.))
# drop intra
y_imem <- y_imem[y_imem$origin != y_imem$dest, ]
# population data
temp_iso <- table_iso[, c(3, 4)]
names(temp_iso) <- c("origin", "M49")
my_pop <- wpp2024 %>%
dplyr::select(5, 11, 12) %>%
rename(M49 = 1, years = 2, pop = 3) %>%
merge(temp_iso, by = "M49")
my_pop$pop <- as.numeric(my_pop$pop)4.4.2 Optimization
We implement a function, my_function_to_opti(), designed
to find the optimal \(\alpha\)
parameters that minimize the LSER criterion (see Appendix B2 in the
article).
my_function_to_opti <- function(to_opti, type = "closed", type_res = "LSER") {
cat("-")
unique_years <- unique(y_imem$year)
nT <- length(unique_years)
# initialization
temp <- stock_long %>%
filter(year %in% unique_years)
n <- length(unique(temp$birth))
for(k_years in 1:nT) {
my_data <- stock_long %>%
filter(year == unique_years[k_years])
# initialization
stock_0 <- my_data[, paste0("stock_0_", type)]
stock_1 <- my_data[, paste0("stock_1_", type)]
birth <- my_data[ , "birth"]
residence <- my_data[ , "REF_AREA"]
hat_outflows <- my_data[ , paste0("hat_outflows_", type)]
hat_inflows <- my_data[ , paste0("hat_inflows_", type)]
my_esti <- my_transport(stock_0, stock_1, birth, residence, hat_outflows,
hat_inflows, rate_returns = to_opti)$res
temp[((k_years - 1) * n^2 + 1):(k_years * n^2), "esti"] <- as.vector(t(my_esti))
}
temp <- merge(temp, y_imem, by.x = c("birth", "REF_AREA", "year"),
by.y = c("origin", "dest", "years"))
temp <- merge(temp, my_pop,
by.x = c("birth", "year"),
by.y = c("origin", "years"))
if (type_res == "LSER")
return(mean(((temp$flow - temp$esti) / temp$pop) ^ 2))
else
return(cbind(temp$flow , temp$esti))
}optimize(my_function_to_opti, c(0, 1), type = "closed")
# 0.6424708
optimize(my_function_to_opti, c(0, 1), type = "open")
# 0.6757961We plot the scatterplot of the observed flows \(F_{ij}^{\text{IMEM}}(t)\) versus the estimated flows \(\hat{F}_{ij}(t)\), using the optimal values of \(\alpha\) (figure 7 in the article).
## -
## -
pdf("figures/imem.pdf", width = 9, height = 4)
par(mfrow = c(1, 2), oma = c(0, 0, 0, 0), mar = c(3.8, 4.3, 0.5, 0.8),
mgp = c(3.4, 0.65, 0), las = 1)
plot(y_closed_nls[, 1], y_closed_nls[, 2], xlab = "Flows (IMEM data) \n",
ylab = "Est. Flows using OT on closed data", xlim = c(0, 140000), ylim = c(0, 140000))
abline(0, 1, lty = 2, col = "grey")
plot(y_open_nls[, 1], y_open_nls[, 2], xlab = "Flows (IMEM data) \n", ylab = "Est. Flows using OT on open data",
xlim = c(0, 140000), ylim = c(0, 140000))
abline(0, 1, lty = 2, col = "grey")
dev.off()## quartz_off_screen
## 2
5 Simulation procedure
The algorithm is fully described in Section 5 of the article and in Appendix C.
5.1 Distribution of Input Parameters in the Simulation Framework
We plot the scatter plot of the pairs \((a^i_j, b^i_j)\), grouped by country of residence \(j\), where \(j\) is one of the following countries: Austria, Belgium, Bulgaria, Switzerland, Czech Republic, Germany, Denmark, Spain, Estonia, Finland, Croatia, Hungary, Ireland, Iceland, Italy, Liechtenstein, Lithuania, Luxembourg, Latvia, North Macedonia, Netherlands, Norway, Romania, Slovakia, Slovenia, Sweden (figure 8 in the article):
data_simu <- stock_long %>%
mutate(outflow_rate_open = outflows / stock_0_open,
outflow_rate_closed = outflows / stock_0_closed,
rate_inflows_outflows = inflows / outflows) %>%
filter(!is.na(outflow_rate_open), !is.na(rate_inflows_outflows),
stock_0_open > 0, stock_0_closed > 0, outflows > 0, inflows > 0) %>%
filter(birth %in% REF_AREA)
# outflows
data_simu_outflows <- data_simu %>%
select(birth, REF_AREA, year, outflow_rate_open) %>%
pivot_wider(names_from = "birth",
values_from = c("outflow_rate_open"))
temp_na <- as.matrix(data_simu_outflows[, -1])
temp_na_2 <- temp_na[, -1]
# we replace the large values of outflows (larger than 1) by NA
temp_na_2[temp_na_2 > 0.3 | temp_na_2 < 0.0001] <- NA
temp_na[, -1] <- temp_na_2
data_simu_outflows[, -1] <- missForest::missForest(temp_na)$ximp
# inflows
data_simu_inflows <- data_simu %>%
select(birth, REF_AREA, year, rate_inflows_outflows) %>%
pivot_wider(names_from = "birth",
values_from = c("rate_inflows_outflows"))
temp_na <- as.matrix(data_simu_inflows[, -1])
temp_na_2 <- temp_na[, -1]
# we replace the large values of rate of inflows (larger than 5) by NA
temp_na_2[temp_na_2 > 8 | temp_na_2 < 0.1] <- NA
temp_na[, -1] <- temp_na_2
data_simu_inflows[, -1] <- missForest::missForest(temp_na)$ximp
unique_country <- sort(unique(data_simu_inflows$REF_AREA))
#####
temp_plot <- data.frame(a = as.vector(as.matrix(data_simu_outflows[, -c(1, 2)])),
b = as.vector(as.matrix(data_simu_inflows[, -c(1, 2)])),
REF_AREA = as.vector(as.matrix(data_simu_outflows[, 1])),
birth = rep(colnames(data_simu_outflows[, -c(1, 2)]), nrow(data_simu_outflows)),
year = factor(as.vector(as.matrix(data_simu_outflows[, 2]))))temp_plot %>%
# filter(year == 2021) %>%
ggplot(aes(x = a, y = b)) +
geom_point(aes( colour = birth), size = 0.2) +
# geom_density_2d() +
facet_wrap(~REF_AREA)We plot the distribution of the \(\text{Beta}(6.5, 3.5)\) distribution used to simulate the parameter \(\alpha_i\) introduced in Step 3 of the simulation process (figure 9 in the article).
pdf("figures/beta.pdf", width = 7, height = 5)
par(las = 1, mar = c(3.8, 4, 0.5, 0.5), mgp = c(2.5, 1, 0))
x_seq <- seq(0, 1, length.out = 100)
plot(x_seq, dbeta(x_seq, 6.5, 3.5), type = "l",
xlab = "alpha", ylab = "Probability Density")
dev.off()## quartz_off_screen
## 2
5.2 Algorithm
We implement the algorithm described in Section 5 of the article:
names_method_esti <- c("drop", "reverse", "OT", "mig. rate", "IPFP", "PB")
# "drop (year)", "reverse (year)", "mig. rate (year)", "IPFP (year)", "PB (year)")
nb_method_esti <- length(names_method_esti)
nb_rep <- 100
mae_tab <- mape_tab <- rmse_tab <- matrix(0, nb_rep, nb_method_esti)
n <- 230my_M <- numeric(nb_rep)
res <- data.frame(
Method = character(nb_method_esti * nb_rep * n * (n - 1)),
outwards = numeric(nb_method_esti * nb_rep * n * (n - 1)),
returns = numeric(nb_method_esti * nb_rep * n * (n - 1)),
transit = numeric(nb_method_esti * nb_rep * n * (n - 1)),
MAE = numeric(nb_method_esti * nb_rep * n * (n - 1)),
estimation = numeric(nb_method_esti * nb_rep * n * (n - 1)),
sign_estimation = character(nb_method_esti * nb_rep * n * (n - 1)),
Y = numeric(nb_method_esti * nb_rep * n * (n - 1)),
RMAE = numeric(nb_method_esti * nb_rep * n * (n - 1)),
MSE = numeric(nb_method_esti * nb_rep * n * (n - 1)),
RMSE = numeric(nb_method_esti * nb_rep * n * (n - 1))
)
increment <- 1:(nb_method_esti * n * (n - 1))
# for the best predictor
best_pred <- character(nb_rep * n * (n - 1))
increment_best <- 1:(n * (n - 1))
countries <- LETTERS[1:n]
system.time(
# parameters for rate of outflows
for(b in 1:nb_rep) {
print(b)
set.seed(b)
load(file = paste0("results/stock_tot_",
sample(seq(1990, 2020, 5), size = 1), ".RData"))
mat_for_simu <- s1_closed
mat_for_simu[mat_for_simu == 0] <- 1
choose_country <- sample(1:230, n)
S1 <- mat_for_simu[choose_country, choose_country]
# choose n countries with replacement in the basis of a/b
outflow_rate <- inflow_rate <- matrix(0, n, n)
samp_a <- sample(unique_country, size = n, replace = T)
# for each country select a year randomly and choose out/in
for(i in 1:n){
# outflows
temp_basis <- data_simu_outflows %>%
filter(REF_AREA == samp_a[i]) %>%
select(-year, -REF_AREA)
# select a year randomly
id_sample <- sample(1:nrow(temp_basis), size = 1)
outflow_rate[i, ] <- as.numeric(temp_basis[id_sample, samp_a])
# inflows
temp_basis <- data_simu_inflows %>%
filter(REF_AREA == samp_a[i]) %>%
select(-year, -REF_AREA)
inflow_rate[i, ] <- as.numeric(temp_basis[id_sample, samp_a])
}
# true flows
Y <- matrix(0, nrow = n, ncol = n,
dimnames = list(From = countries, To = countries))
S1_all <- S1
alpha_i <- numeric(n)
mat_outwards <- matrix(0, n, n)
mat_returns <- matrix(0, n, n)
mat_transit <- matrix(0, n, n)
# Creation de la matrice F
my_F <- vector("list", n)
for(years in 1:5) {
# Initialisation de la matrice de flux people born in i
for (i in 1:n){
my_F[[i]] <- matrix(0, nrow = n, ncol = n,
dimnames = list(From = countries, To = countries))
}
S2_all <- matrix(0, nrow = n, ncol = n,
dimnames = list(From = countries, To = countries))
for (i in 1:n) {
# Taux de retour (alpha)
if (years == 1) {
alpha_i[i] <- rbeta(1, 6.5, 3.5)
}
total_out <- S1_all[i, ] * outflow_rate[i, ]
# Retours
return_flow <- alpha_i[i] * total_out[-i]
my_F[[i]][-i, i] <- return_flow
# Transit : réparti aléatoirement sur les autres pays
transit_flow <- (1 - alpha_i[i]) * total_out
for(k in (1:n)[-i]) {
temp_transit <- as.numeric(rmultinom(1, size = as.numeric(transit_flow[k]), S1_all[i, -c(i, k)]))
my_F[[i]][k, -c(i, k)] <- temp_transit
}
# outwards : we drop from the inflows the transit
total_in <- total_out * inflow_rate[i, ] - colSums(my_F[[i]])
my_F[[i]][i, -c(i)] <- ifelse(total_in[-i] > 0, total_in[-i], -total_in[-i] / 2)
# we stock the results by group
mat_returns[-i, i] <- mat_returns[-i, i] + my_F[[i]][-i, i]
for(k in (1:n)[-i]) {
mat_transit[k, -c(i, k)] <- mat_transit[k, -c(i, k)] + my_F[[i]][k, -c(i, k)]
}
mat_outwards[i, -c(i)] <- mat_outwards[i, -c(i)] + my_F[[i]][i, -c(i)]
# stayers
diag(my_F[[i]]) <- S1_all[i, ] - rowSums(my_F[[i]])
# stock tables
S2_all[i, ] <- colSums(my_F[[i]])
Y <- Y + my_F[[i]]
}
# update stock tables
S1_all <- S2_all
}
diag(Y) <- 1
# estimates
hat_drop <- my_diff(S1, S2_all, type = "drop")
hat_reverse <- my_diff(S1, S2_all, type = "reverse")
M <- sum(abs(apply(S2_all, 2, sum) - apply(S1, 2, sum)))
mig_rate <- round(M * (S1) / (sum(S1) - sum(diag(S1))))
diag(mig_rate) <- 0
hat_min_closed <- my_min(S1, S2_all)
hat_pb <- 0.87 * hat_min_closed + (1 - 0.87) * bayes(S1, S2_all)
diag(hat_pb) <- 0
# For transport
hat_transport <- matrix(0, n, n)
# hat_drop_yearly <- hat_reverse_yearly <-
# mig_rate_yearly <- hat_min_closed_yearly <-
# hat_pb_yearly <- matrix(0, n, n)
# Compute stocks tables per year
s_year_t0 <- S1
for(time_k in 1:5) {
s_year_t1 = S1 + time_k * (S2_all - S1) / 5
# vectorize
new_data <- data.frame(
stock_0 = as.vector(s_year_t0),
stock_1 = as.vector(s_year_t1),
birth = rep(row.names(S1), n),
residence = rep(row.names(S1), each = n))
hat_outflows <- predict(nls_outflows[["closed"]], newdata = new_data)
hat_inflows <- predict(nls_inflows[["closed"]], newdata = new_data)
temp_time <- my_transport(new_data$stock_0, new_data$stock_1,
new_data$birth, new_data$residence,
hat_outflows, hat_inflows, rate_returns = 0.6424708)$res
hat_transport <- hat_transport + temp_time
# other methods computed year by year
# estimates
#hat_drop_yearly <- hat_drop_yearly +
# my_diff(s_year_t0, s_year_t1, type = "drop")
#hat_reverse_yearly <- hat_reverse_yearly +
# my_diff(s_year_t0, s_year_t1, type = "reverse")
#M <- sum(abs(apply(s_year_t1, 2, sum) - apply(s_year_t0, 2, sum)))
#mig_rate_yearly <- mig_rate_yearly +
# round(M * (s_year_t0) / (sum(s_year_t0) - sum(diag(s_year_t0))))
#temp_min <- my_min(s_year_t0, s_year_t1)
#hat_min_closed_yearly <- hat_min_closed_yearly + temp_min
#hat_pb_yearly <- hat_pb_yearly +
# 0.87 * temp_min + (1 - 0.87) * bayes(s_year_t0, s_year_t1)
# initialize
s_year_t0 <- s_year_t1
}
# diag(mig_rate_yearly) <- 0
# diag(hat_pb_yearly) <- 0
ind_h_diag <- cbind(rep(1:n, each = n),
rep(1:n, times = n))
ind_h_diag <- ind_h_diag[!(ind_h_diag[, 1] == ind_h_diag[, 2]), ]
# Error due to M for the migration rate method
my_M[b] <- M / sum(Y[ind_h_diag])
# metrics computed
RMAE <- c(((abs(Y - hat_drop)/Y))[ind_h_diag],
((abs(Y - hat_reverse)/Y))[ind_h_diag],
((abs(Y - hat_transport)/Y))[ind_h_diag],
((abs(Y - mig_rate)/Y))[ind_h_diag],
((abs(Y - hat_min_closed)/Y))[ind_h_diag],
((abs(Y - hat_pb)/Y))[ind_h_diag]
# ((abs(Y - hat_drop_yearly)/Y))[ind_h_diag],
# ((abs(Y - hat_reverse_yearly)/Y))[ind_h_diag],
# ((abs(Y - mig_rate_yearly)/Y))[ind_h_diag],
# ((abs(Y - hat_min_closed_yearly)/Y))[ind_h_diag],
# ((abs(Y - hat_pb_yearly)/Y))[ind_h_diag]
)
MAE <- c(((abs(Y - hat_drop)))[ind_h_diag],
((abs(Y - hat_reverse)))[ind_h_diag],
((abs(Y - hat_transport)))[ind_h_diag],
((abs(Y - mig_rate)))[ind_h_diag],
((abs(Y - hat_min_closed)))[ind_h_diag],
((abs(Y - hat_pb)))[ind_h_diag]
#((abs(Y - hat_drop_yearly)))[ind_h_diag],
# ((abs(Y - hat_reverse_yearly)))[ind_h_diag],
# ((abs(Y - mig_rate_yearly)))[ind_h_diag],
# ((abs(Y - hat_min_closed_yearly)))[ind_h_diag],
# ((abs(Y - hat_pb_yearly)))[ind_h_diag]
)
MSE <- c((((Y - hat_drop)^2))[ind_h_diag],
(((Y - hat_reverse)^2))[ind_h_diag],
(((Y - hat_transport)^2))[ind_h_diag],
(((Y - mig_rate)^2))[ind_h_diag],
(((Y - hat_min_closed)^2))[ind_h_diag],
(((Y - hat_pb)^2))[ind_h_diag]
#(((Y - hat_drop_yearly)^2))[ind_h_diag],
# (((Y - hat_reverse)^2))[ind_h_diag],
# (((Y - mig_rate_yearly)^2))[ind_h_diag],
# (((Y - hat_min_closed_yearly)^2))[ind_h_diag],
# (((Y - hat_pb_yearly)^2))[ind_h_diag]
)
RMSE <- c((((Y - hat_drop)^2)/Y)[ind_h_diag],
(((Y - hat_reverse)^2)/Y)[ind_h_diag],
(((Y - hat_transport)^2)/Y)[ind_h_diag],
(((Y - mig_rate)^2)/Y)[ind_h_diag],
(((Y - hat_min_closed)^2)/Y)[ind_h_diag],
(((Y - hat_pb)^2)/Y)[ind_h_diag]
# (((Y - hat_drop_yearly)^2)/Y)[ind_h_diag],
# (((Y - hat_reverse_yearly)^2)/Y)[ind_h_diag],
# (((Y - mig_rate_yearly)^2)/Y)[ind_h_diag],
# (((Y - hat_min_closed_yearly)^2)/Y)[ind_h_diag],
# (((Y - hat_pb_yearly)^2)/Y)[ind_h_diag]
)
estimation <- c(hat_drop[ind_h_diag], hat_reverse[ind_h_diag],
hat_transport[ind_h_diag], mig_rate[ind_h_diag],
hat_min_closed[ind_h_diag], hat_pb[ind_h_diag]
# hat_drop_yearly[ind_h_diag], hat_reverse_yearly[ind_h_diag],
# mig_rate_yearly[ind_h_diag], hat_min_closed_yearly[ind_h_diag],
# hat_pb_yearly[ind_h_diag]
)
sign_estimation <- c(((Y - hat_drop) > 0)[ind_h_diag],
((Y - hat_reverse) > 0)[ind_h_diag],
((Y - hat_transport) > 0)[ind_h_diag],
((Y - mig_rate) > 0)[ind_h_diag],
((Y - hat_min_closed) > 0)[ind_h_diag],
((Y - hat_pb) > 0)[ind_h_diag]
# ((Y - hat_drop_yearly) > 0)[ind_h_diag],
# ((Y - hat_reverse_yearly) > 0)[ind_h_diag],
# ((Y - mig_rate_yearly) > 0)[ind_h_diag],
# ((Y - hat_min_closed_yearly) > 0)[ind_h_diag],
# ((Y - hat_pb_yearly) > 0)[ind_h_diag]
)
compare_method <- cbind(((abs(Y - hat_drop)))[ind_h_diag],
((abs(Y - hat_reverse)))[ind_h_diag],
((abs(Y - hat_transport)))[ind_h_diag],
((abs(Y - mig_rate)))[ind_h_diag],
((abs(Y - hat_min_closed)))[ind_h_diag],
((abs(Y - hat_pb)))[ind_h_diag]
# ((abs(Y - hat_drop_yearly)))[ind_h_diag],
# ((abs(Y - hat_reverse_yearly)))[ind_h_diag],
# ((abs(Y - mig_rate_yearly)))[ind_h_diag],
# ((abs(Y - hat_min_closed_yearly)))[ind_h_diag],
# ((abs(Y - hat_pb_yearly)))[ind_h_diag]
)
best_pred[increment_best] <- apply(compare_method, 1, which.min)
mae_tab[b, ] <- colMeans(compare_method)
# mape
compare_method_2 <- cbind(((abs(Y - hat_drop) / Y))[ind_h_diag],
((abs(Y - hat_reverse) / Y))[ind_h_diag],
((abs(Y - hat_transport) / Y))[ind_h_diag],
((abs(Y - mig_rate) / Y))[ind_h_diag],
((abs(Y - hat_min_closed) / Y))[ind_h_diag],
((abs(Y - hat_pb) / Y))[ind_h_diag]
# ((abs(Y - hat_drop_yearly) / Y))[ind_h_diag],
# ((abs(Y - hat_reverse_yearly) / Y))[ind_h_diag],
# ((abs(Y - mig_rate_yearly) / Y))[ind_h_diag],
# ((abs(Y - hat_min_closed_yearly) / Y))[ind_h_diag],
# ((abs(Y - hat_pb_yearly) / Y))[ind_h_diag]
)
mape_tab[b, ] <- colMeans(compare_method_2)
# mse
compare_method_3 <- cbind((((Y - hat_drop)^2))[ind_h_diag],
(((Y - hat_reverse)^2))[ind_h_diag],
(((Y - hat_transport)^2))[ind_h_diag],
(((Y - mig_rate)^2))[ind_h_diag],
(((Y - hat_min_closed)^2))[ind_h_diag],
(((Y - hat_pb)^2))[ind_h_diag]
# (((Y - hat_drop_yearly)^2))[ind_h_diag],
# (((Y - hat_reverse_yearly)^2))[ind_h_diag],
# (((Y - mig_rate_yearly)^2))[ind_h_diag],
# (((Y - hat_min_closed_yearly)^2))[ind_h_diag],
# (((Y - hat_pb_yearly)^2))[ind_h_diag]
)
rmse_tab[b, ] <- sqrt(colMeans(compare_method_3))
res[increment, ] <- data.frame(Method = rep(names_method_esti, each = n * (n - 1)),
outwards = mat_outwards[ind_h_diag],
returns = mat_returns[ind_h_diag],
transit = mat_transit[ind_h_diag],
MAE = MAE,
estimation = estimation,
sign_estimation = c("Over-", "Under-")[sign_estimation + 1],
Y = Y[ind_h_diag],
RMAE = RMAE,
MSE = MSE,
RMSE = RMSE)
increment <- increment + nb_method_esti * n * (n-1)
increment_best <- increment_best + n * (n-1)
}
)
# scenario
res$scenario <- ""
res$scenario[res$outwards > res$returns & res$outwards > res$transit ] <- "Outwards >> Returns, Transit"
res$scenario[res$returns > res$outwards & res$returns > res$transit ] <- "Returns >> Outwards, Transit"
res$scenario[res$transit > res$outwards & res$transit > res$returns ] <- "Transit >> Outwards, Returns"
# proportion of sign
res$sign_estimation <- substr(res$sign_estimation, 1,
nchar(res$sign_estimation) - 1)
mape_tab <- as.data.frame(mape_tab)
colnames(mape_tab) <- names_method_esti
mae_tab <- as.data.frame(mae_tab)
colnames(mae_tab) <- names_method_esti
rmse_tab <- as.data.frame(rmse_tab)
colnames(rmse_tab) <- names_method_esti
# save
save(res, my_M, best_pred, mape_tab, mae_tab, rmse_tab,
file = "results/res_simu.RData")
# save(res, my_M, best_pred, mape_tab, mae_tab, rmse_tab,
# file = "results/res_simu_yearly.RData")5.3 Results
We represent the parallel boxplots of the three evaluation metrics—MAE, MAPE, and RMSE—across the different methods, based on the 100 simulation replications (figure 2 in the article) :
load(file = "results/res_simu.RData")
res_tab <- rbind(mape_tab, mae_tab, rmse_tab) %>%
pivot_longer(cols = 1:nb_method_esti, names_to = "Method", values_to = "Values") %>%
mutate(Metric = rep(c("MAPE", "MAE", "RMSE"), each = nb_rep * nb_method_esti))## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(nb_method_esti)
##
## # Now:
## data %>% select(all_of(nb_method_esti))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
res_tab$Method[res_tab$Method == "mini."] <- "IPFP"
res_tab$Method <- factor(res_tab$Method,
levels = c("drop", "reverse", "mig. rate", "IPFP", "PB", "OT"))
ggplot(res_tab, aes(x = Method, y = Values)) +
geom_boxplot() +
theme(axis.text.x=element_text(angle = 90)) +
facet_wrap(~Metric, scales = "free_y", ncol = 3)We compute the average values of the evaluation indicators—BPR, UND, MAE, MAPE, and RMSE—computed across the \(B\) simulation replications. Values highlighted in red represent the best performance for each metric, while those in orange indicate the second-best performance (Table 1 in the article):
res$Method[res$Method == "minimization"] <- "IPFP"
res$Method[res$Method == "migr. rate"] <- "mig. rate"
res$Method <- factor(res$Method,
levels = c("drop", "reverse", "mig. rate", "IPFP", "PB", "OT"))
res_tab <- res %>%
group_by(Method) %>%
summarise(
UND = round(mean(estimation < Y) * 100, 2),
MAE = mean(MAE),
MAPE = round(100 * mean(RMAE), 2),
RMSE = sqrt(mean(MSE)))
res_tab <- merge(
data.frame(BPR = round(100 * as.numeric(table(best_pred)) /
sum(as.numeric(table(best_pred))), 2),
Method = names_method_esti),
res_tab,
by = "Method")
row.names(res_tab) <- res_tab$Method
t(round(res_tab[c("drop", "reverse", "mig. rate", "IPFP", "PB", "OT"), -1], 2))## drop reverse mig. rate IPFP PB OT
## BPR 7.16 0.25 10.78 4.91 20.00 56.90
## UND 88.03 87.99 85.26 86.14 76.85 61.09
## MAE 1990.03 1942.25 2322.42 1978.24 1386.65 1343.06
## MAPE 73.11 72.70 99.76 82.19 65.60 61.60
## RMSE 44102.33 43628.36 97346.71 44021.50 38400.40 39818.08
We represent the scatter plots of the estimated flows \(\hat{F}_{ij}\) versus the observed flows \(F_{ij}\) for each method (drop negative, migration rate, minimization, Optimal Transport, Pseudo-Bayes, and reverse negative). The lower panel zooms in on the smallest flows (figure 10 in the article)
set.seed(1)
choose_ind <- sample(1:100, size = 5)
temp <- sapply(choose_ind, function(x) ((x - 1) * ( n * (n - 1) * 6) + 1):(x * (6 * n * (n - 1)) ))
res_1_simu <- res[temp, ]
p <- ggplot(res_1_simu, aes(x = Y, y = estimation, color = sign_estimation)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = "red") +
labs(color = "Over/Under estimate") +
facet_wrap(~Method)
pp <- ggplot(res_1_simu, aes(x = Y, y = estimation, color = sign_estimation)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = "red") +
labs(color = "Over/Under estimate") +
facet_wrap(~Method) +
xlim(0, 300000) +
ylim(0, 300000)
#ggsave("figures/scatterplot_2.jpeg", width = 12, height = 7, plot = p)We represent the boxplots of the absolute relative error, \(\frac{|F_{ij} - \hat{F}^{(m)}_{ij}|}{F_{ij}}\), computed over five simulations, stratified by the dominant component of each true flow: Outwards, Returns, or Transits (figure 11 in the article)
res_1_simu %>%
ggplot() +
aes(x = Method, y = RMAE) +
geom_boxplot() +
ylim(0, 2) +
ylab(latex2exp::TeX(r'($(|F_{ij}-\hat{F}_{ij}|)/F_{ij}$)')) +
theme(axis.text.x=element_text(angle = 90)) +
facet_wrap(~scenario) 6 Estimation procedure
The final dataset includes:
originanddest(origin and destination countries),period(five-year period),type(sex disaggregation).
Migration estimates are computed using UN stock data under different adjustment schemes:
Using Raw data:
hat_drop: drop negative from raw data,hat_reverse: reverse negative from raw data,hat_mig_rate: migration rate from raw data,
Using Open Adjusted data:
hat_drop_open: drop negative on open adjusted data,hat_reverse_open: reverse negative on open adjusted data,hat_mig_rate_open: migration rate on open adjusted data,hat_min_open: minimization method on open adjusted data,
Using Closed Adjusted data: * hat_drop_closed: drop
negative on closed adjusted data, * hat_reverse_closed:
reverse negative on closed adjusted data, *
hat_mig_rate_closed: migration rate on closed adjusted
data, * hat_min_closed: minimization method on closed
adjusted data, * hat_pb: pseudo-Bayes method on closed
adjusted data.
our_estimates_1y <- data.frame(
origin = character(0),
dest = character(0),
year = character(0),
type = character(0),
# Open
OT_open = numeric(0),
OT_open_outwards = numeric(0),
OT_open_returns = numeric(0),
OT_open_transits = numeric(0),
# closed
OT_closed = numeric(0),
OT_closed_outwards = numeric(0),
OT_closed_returns = numeric(0),
OT_closed_transits = numeric(0)
)
our_estimates_5y <- data.frame(
origin = character(0),
dest = character(0),
year = character(0),
type = character(0),
# Open
hat_transport_open_outwards = numeric(0),
hat_transport_open_returns = numeric(0),
hat_transport_open_transits = numeric(0),
# closed
hat_transport_closed_outwards = numeric(0),
hat_transport_closed_returns = numeric(0),
hat_transport_closed_transits = numeric(0)
)
all_estimates <- data.frame(
origin = character(0),
dest = character(0),
year = character(0),
type = character(0),
# No adjustment
hat_drop = numeric(0),
hat_reverse = numeric(0),
hat_mig_rate = numeric(0),
# Open
hat_drop_open = numeric(0),
hat_reverse_open = numeric(0),
hat_mig_rate_open = numeric(0),
hat_min_open = numeric(0),
hat_transport_open = numeric(0),
# closed
hat_drop_closed = numeric(0),
hat_reverse_closed = numeric(0),
hat_mig_rate_closed = numeric(0),
hat_min_closed = numeric(0),
hat_pb = numeric(0),
hat_transport_closed = numeric(0)
)years <- c(seq(1990, 2020, by = 5), 2024)
n <- 230
ptm <- proc.time()
for (k in 1:(length(years) - 1)) {
my_period <- paste0(years[k], "-", years[k + 1])
cat("Period: ", my_period, "\n")
for (l in c("tot", "m", "f")) {
cat(" Sex: ", l, "\n")
# import stock table
load(file = paste0("results/stock_", l, "_", years[k], ".RData"))
# Estimates
# 1/ Tables without adjustment : drop, reverse and migration rate
## drop and reverse
hat_drop <- my_diff(s1, s2, type = "drop")
hat_reverse <- my_diff(s1, s2, type = "reverse")
## migration rate
# compute the total number M of migrant
# it is first adjusted such that the sum of the elements of the vector equals 0 (rescale_net function)
M <- sum(abs(migest::rescale_net(demo[demo$period == my_period,
paste0(l, "_", "net")])))
hat_mig_rate <- round(M * s1 / (sum(s1) - sum(diag(s1))))
diag(hat_mig_rate) <- 0
# 2/ Tables with open adjusted data
hat_drop_open <- my_diff(s1_open, s2_open, type = "drop")
hat_reverse_open <- my_diff(s1_open, s2_open, type = "reverse")
hat_mig_rate_open <- round(M * s1_open / (sum(s1_open) - sum(diag(s1_open))))
diag(hat_mig_rate_open) <- 0
hat_min_open <- my_min(s1_open, s2_open)
# 3/ Tables with closed adjusted data
hat_drop_closed <- my_diff(s1_closed, s2_closed, type = "drop")
hat_reverse_closed <- my_diff(s1_closed, s2_closed, type = "reverse")
hat_mig_rate_closed <- round(M * s1_closed / (sum(s1_closed) - sum(diag(s1_closed))))
diag(hat_mig_rate_closed) <- 0
hat_min_closed <- my_min(s1_closed, s2_closed)
hat_pb <- 0.87 * hat_min_closed + (1 - 0.87) * bayes(s1_closed, s2_closed)
diag(hat_pb) <- 0
##################
### Methodology for OT
####. closed and OPEN at the same time
s_year_0_closed <- s1_closed
hat_transport_closed <- outwards_closed <- returns_closed <- transit_closed <- numeric(n^2)
s_year_0_open <- s1_open
hat_transport_open <- outwards_open <- returns_open <- transit_open <- numeric(n^2)
for(t_0 in 1:5) {
if(years[k] == 2020 & t_0 == 5)
break
##############
# closed
s_year_1_closed = s1_closed + t_0 * (s2_closed - s1_closed) /
ifelse(years[k] != 2020, 5, 4)
new_data <- data.frame(
stock_0 = as.vector(s_year_0_closed),
stock_1 = as.vector(s_year_1_closed),
birth = rep(row.names(s_year_0_closed), n),
residence = rep(row.names(s_year_0_closed), each = n))
hat_outflows <- predict(nls_outflows[["closed"]], newdata = new_data)
hat_inflows <- predict(nls_inflows[["closed"]], newdata = new_data)
temp_time_closed <- my_transport(new_data$stock_0, new_data$stock_1,
new_data$birth, new_data$residence,
hat_outflows, hat_inflows,
rate_returns = 0.6424708)
hat_transport_closed <- hat_transport_closed + as.vector(temp_time_closed$res)
outwards_closed <- outwards_closed + as.vector(temp_time_closed$outwards)
returns_closed <- returns_closed + as.vector(temp_time_closed$returns)
transit_closed <- transit_closed + as.vector(temp_time_closed$transit)
############
# open
s_year_1_open = s1_open + t_0 * (s2_open - s1_open) /
ifelse(years[k] != 2020, 5, 4)
new_data <- data.frame(
stock_0 = as.vector(s_year_0_open),
stock_1 = as.vector(s_year_1_open),
birth = rep(row.names(s_year_0_open), n),
residence = rep(row.names(s_year_0_open), each = n))
hat_outflows <- predict(nls_outflows[["open"]], newdata = new_data)
hat_inflows <- predict(nls_inflows[["open"]], newdata = new_data)
temp_time_open <- my_transport(new_data$stock_0, new_data$stock_1,
new_data$birth, new_data$residence,
hat_outflows, hat_inflows,
rate_returns = 0.6757961)
hat_transport_open <- hat_transport_open + as.vector(temp_time_open$res)
outwards_open <- outwards_open + as.vector(temp_time_open$outwards)
returns_open <- returns_open + as.vector(temp_time_open$returns)
transit_open <- transit_open + as.vector(temp_time_open$transit)
our_estimates_1y <- rbind(our_estimates_1y, data.frame(
origin = rep(country$iso, n),
dest = rep(country$iso, each = n),
year = years[k] + t_0,
type = rep(l, n * n),
# Open
OT_open = as.vector(temp_time_open$res),
OT_open_outwards = as.vector(temp_time_open$outwards),
OT_open_returns = as.vector(temp_time_open$returns),
OT_open_transits = as.vector(temp_time_open$transit),
# closed
OT_closed = as.vector(temp_time_closed$res),
OT_closed_outwards = as.vector(temp_time_closed$outwards),
OT_closed_returns = as.vector(temp_time_closed$returns),
OT_closed_transits = as.vector(temp_time_closed$transit)
)
)
# re-initialisation
s_year_0_closed <- s_year_1_closed
s_year_0_open <- s_year_1_open
}
#######
all_estimates <- rbind(all_estimates, data.frame(
origin = rep(country$iso, n),
dest = rep(country$iso, each = n),
year = paste0(years[k], "-", years[k + 1]),
type = rep(l, n * n),
# No adjustment
hat_drop = as.vector(hat_drop),
hat_reverse = as.vector(hat_reverse),
hat_mig_rate = as.vector(hat_mig_rate),
# hat_transport = as.vector(hat_transport),
# Open
hat_drop_open = as.vector(hat_drop_open),
hat_reverse_open = as.vector(hat_reverse_open),
hat_mig_rate_open = as.vector(hat_mig_rate_open),
hat_min_open = as.vector(hat_min_open),
hat_transport_open = as.vector(hat_transport_open),
# closed
hat_drop_closed = as.vector(hat_drop_closed),
hat_reverse_closed = as.vector(hat_reverse_closed),
hat_mig_rate_closed = as.vector(hat_mig_rate_closed),
hat_min_closed = as.vector(hat_min_closed),
hat_transport_closed = as.vector(hat_transport_closed),
hat_pb = as.vector(hat_pb)
))
our_estimates_5y <- rbind(our_estimates_5y, data.frame(
origin = rep(country$iso, n),
dest = rep(country$iso, each = n),
year = paste0(years[k], "-", years[k + 1]),
type = rep(l, n * n),
# Open
hat_transport_open_outwards = as.vector(outwards_open),
hat_transport_open_returns = as.vector(returns_open),
hat_transport_open_transits = as.vector(transit_open),
# closed
hat_transport_closed_outwards = as.vector(outwards_closed),
hat_transport_closed_returns = as.vector(returns_closed),
hat_transport_closed_transits = as.vector(transit_closed)
)
)
}
}6.1 Comparison of estimation methods and data consistency
Here, we aim to assess whether our results align with those published by Abel and Cohen. For this comparison, we use the version of their dataset dated 2025-03-26, 06:16, available at: https://figshare.com/articles/dataset/Bilateral_international_migration_flow_estimates_for_200_countries_1990-1995_to_2010-2015_/7731233?file=53235671.
abel <- read.csv("data/bilat_mig.csv")
abel$year0 <- paste0(abel$year0, "-", abel$year0 + 5)
abel_long <- pivot_longer(abel, cols = 4:9, names_to = "method", values_to = "abel_estimate")
our_estimates_long <- all_estimates %>%
filter(type == "tot") %>%
dplyr::select(origin, dest, year, hat_drop, hat_reverse, hat_mig_rate,
hat_min_open, hat_min_closed, hat_pb) %>%
rename(sd_drop_neg = 4, sd_rev_neg = 5, mig_rate = 6, da_min_open = 7,
da_min_closed = 8, da_pb_closed = 9) %>%
pivot_longer(cols = 4:9, names_to = "method", values_to = "our_estimate")
# merge the two data bases
our_estimates_long <- merge(our_estimates_long, abel_long, by.x = c("origin", "dest", "year", "method"),
by.y = c("orig", "dest", "year0", "method"))We plot the correlation between our estimate and abel’s (Figure 12 in the article):
our_estimates_long$method[our_estimates_long$method == "da_min_closed"] <- "IPFP (closed)"
our_estimates_long$method[our_estimates_long$method == "da_min_open"] <- "IPFP (open)"
our_estimates_long$method[our_estimates_long$method == "da_pb_closed"] <- "PB"
our_estimates_long$method[our_estimates_long$method == "mig_rate"] <- "mig. rate"
our_estimates_long$method[our_estimates_long$method == "sd_drop_neg"] <- "drop"
our_estimates_long$method[our_estimates_long$method == "sd_rev_neg"] <- "reverse"
our_estimates_long$method <- factor(our_estimates_long$method,
levels = c("drop", "reverse", "mig. rate", "IPFP (open)", "IPFP (closed)", "PB"))
our_estimates_long %>%
ggplot() +
aes(x = abel_estimate, y = our_estimate) +
geom_point() +
xlab("Estimates produced by Guy J. Abel") +
ylab("Estimates produced by the Authors") +
geom_abline(slope = 1, intercept = 0) +
facet_wrap(~ method)We compute the mean/median of the absolute differences :
our_estimates_long %>%
mutate(diff = abs(abel_estimate - our_estimate)) %>%
group_by(method) %>%
summarise(diff_mean = mean(diff),
diff_med = median(diff)) %>%
arrange(-diff_mean)## # A tibble: 6 × 3
## method diff_mean diff_med
## <fct> <dbl> <dbl>
## 1 IPFP (closed) 108. 0
## 2 PB 107. 0.03
## 3 mig. rate 10.0 0
## 4 IPFP (open) 6.03 0
## 5 drop 0 0
## 6 reverse 0 0
We observe differences between the two versions, which can be attributed to database updates and the increased significance of the “unknown” category in the latest data version. Moreover, the closed demographic adjustment and Pseudo-Bayes estimates show the greatest divergence from our results.
Note : Some of these differences may appear significant, particularly the flow from Germany to the United States between 1990 and 1995. This flow was very small in the previous version of Abel and Cohen, but is notably larger in the current version. This is due to the high stock of Germans in the U.S. in 1995 (2,261,086), which is much higher compared to 1990 (1,465,599) and 2000 (1,369,572). The other flows that differ correspond to denser migration corridors (from countries like India or China or Mexico to the USA), for which it can be assumed that adjustments were made based on updated population censuses.
We plot the total Migration Flows (in millions) and Crude Migration Rates (migrants per thousand people in the population) (Figure 3 in the article) :
wpp2024_pop <- wpp2024 %>% filter(`Region, subregion, country or area *` == "World") %>%
select(11, 13) %>%
filter(Year %in% c(1990, 1995, 2000, 2005, 2010, 2015, 2020)) %>%
mutate(Period = paste0(Year, "-", Year + ifelse(Year == 2020, 4, 5))) %>%
rename(Pop = 2) %>%
mutate(Pop = as.numeric(Pop))
our_estimates_long <- all_estimates %>%
filter(type == "tot") %>%
pivot_longer(cols = 5:18, names_to = "Method", values_to = "Flows")
temp_long <- our_estimates_long %>%
group_by(year, Method) %>%
summarise(Flows = sum(Flows)) %>%
mutate(Period = factor(year),
Method = substr(Method, 5, nchar(Method))) %>%
mutate(Method = gsub(pattern = "mig_", replacement = "mig. ", Method),
Method = gsub(pattern = "pb", replacement = "PB_closed", Method),
Method = gsub(pattern = "transport", replacement = "OT", Method)) %>%
separate(col = Method, into = c("Method", "stock"), sep = "_") %>%
mutate(stock = ifelse(is.na(stock), "raw", stock)) ## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 21 rows [1, 4, 10, 15,
## 18, 24, 29, 32, 38, 43, 46, 52, 57, 60, 66, 71, 74, 80, 85, 88, ...].
temp_long$type <- "Sum of flows \n (in Millions)"
temp_long_2 <- merge(temp_long, wpp2024_pop, by = "Period")
temp_long_2$Flows <- temp_long_2$Flows / temp_long_2$Pop
temp_long_2$type <- "Crude Migration Rate \n (per thousand population)"
temp_long$Flows <- temp_long$Flows / 10^6
temp_long <- rbind(temp_long, temp_long_2)
temp_long$type <- factor(temp_long$type,
levels = c("Sum of flows \n (in Millions)",
"Crude Migration Rate \n (per thousand population)"))
temp_long$Method[temp_long$Method == "min"] <- "IPFP"
temp_long$Method <- factor(temp_long$Method,
levels = c("drop", "reverse", "mig. rate", "IPFP", "PB", "OT"))
ggplot(temp_long, aes(x = Period, y = Flows, color = Method, group = Method)) +
geom_line() +
theme(axis.text.x=element_text(angle = 90)) +
# scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6)) +
facet_grid(type ~ stock, scales = "free_y")We represent the Summary statistics of the estimates (Table 2 in the article):
stargazer::stargazer(all_estimates[all_estimates$type == "tot",],
type = "latex", median = TRUE,
align = T, header = F, digits = 0,
title = " Summary statistics of the samples")##
## \begin{table}[!htbp] \centering
## \caption{ Summary statistics of the samples}
## \label{}
## \begin{tabular}{@{\extracolsep{5pt}}lD{.}{.}{-0} D{.}{.}{-0} D{.}{.}{-0} D{.}{.}{-0} D{.}{.}{-0} D{.}{.}{-0} }
## \\[-1.8ex]\hline
## \hline \\[-1.8ex]
## Statistic & \multicolumn{1}{c}{N} & \multicolumn{1}{c}{Mean} & \multicolumn{1}{c}{St. Dev.} & \multicolumn{1}{c}{Min} & \multicolumn{1}{c}{Median} & \multicolumn{1}{c}{Max} \\
## \hline \\[-1.8ex]
## hat\_drop & 368,690 & 539 & 13,194 & 0 & 0 & 2,559,224 \\
## hat\_reverse & 368,690 & 695 & 15,263 & 0 & 0 & 2,559,224 \\
## hat\_mig\_rate & 368,690 & 1,234 & 24,115 & 0 & 0 & 4,603,924 \\
## hat\_drop\_open & 368,690 & 626 & 13,904 & 0 & 0 & 2,652,783 \\
## hat\_reverse\_open & 368,690 & 732 & 15,328 & 0 & 0 & 2,652,783 \\
## hat\_mig\_rate\_open & 368,690 & 1,234 & 24,105 & 0 & 0 & 4,631,496 \\
## hat\_min\_open & 368,690 & 668 & 14,696 & 0 & 0 & 2,650,984 \\
## hat\_transport\_open & 368,690 & 1,451 & 19,196 & 0 & 0 & 2,930,110 \\
## hat\_drop\_closed & 368,690 & 779 & 16,614 & 0 & 0 & 2,019,930 \\
## hat\_reverse\_closed & 368,690 & 1,025 & 19,495 & 0 & 0 & 2,058,246 \\
## hat\_mig\_rate\_closed & 368,690 & 1,234 & 23,895 & 0 & 0 & 4,650,990 \\
## hat\_min\_closed & 368,690 & 879 & 17,137 & 0 & 0 & 2,062,016 \\
## hat\_transport\_closed & 368,690 & 1,726 & 22,324 & 0 & 1 & 2,285,943 \\
## hat\_pb & 368,690 & 1,664 & 24,325 & 0 & 0 & 3,035,397 \\
## \hline \\[-1.8ex]
## \end{tabular}
## \end{table}
# proportion zero
temp_summary <- all_estimates[all_estimates$type == "tot",]
sapply(temp_summary, function(x) round(100 * length(which(x == 0)) / length(x), 2))## origin dest year
## 0.00 0.00 0.00
## type hat_drop hat_reverse
## 0.00 88.48 85.59
## hat_mig_rate hat_drop_open hat_reverse_open
## 84.17 87.16 85.16
## hat_mig_rate_open hat_min_open hat_transport_open
## 84.17 78.79 16.58
## hat_drop_closed hat_reverse_closed hat_mig_rate_closed
## 88.62 85.30 84.17
## hat_min_closed hat_transport_closed hat_pb
## 74.49 16.58 52.50
We plot figure 13 :
# shares by year
shares <- aggregate(our_estimates[our_estimates$type == "tot", -c(1, 2, 3, 4)],
by = list(year = our_estimates$year[our_estimates$type == "tot"]),
FUN = sum)
shares[, c(2, 3, 4)] <- shares[, c(2, 3, 4)] / rowSums(shares[, c(2, 3, 4)])
shares[, c(5, 6, 7)] <- shares[, c(5, 6, 7)] / rowSums(shares[, c(5, 6, 7)])
shares_pivot <- shares %>%
pivot_longer(cols = -1, names_to = "compo",
values_to = "share") %>%
separate(col = compo, into = c("hat", "transport", "Method", "compo"), sep = "_") %>%
select(-hat, -transport)
shares_pivot$year <- factor(shares_pivot$year)
ggplot(shares_pivot, aes(x = year, y = share, color = compo, group = compo)) +
geom_line() +
facet_wrap(~ Method) +
theme(axis.text.x = element_text(angle = 90)) +
ylab("Percentage of Total Flows")ggsave("figures/compo.pdf", width = 8, height = 3.5)
# global shares
shares_tot <- aggregate(our_estimates[our_estimates$type == "tot", -c(1, 2, 3, 4)],
by = list(year = our_estimates$type[our_estimates$type == "tot"]),
FUN = sum)
shares_tot[, c(2, 3, 4)] <- shares_tot[, c(2, 3, 4)] / rowSums(shares_tot[, c(2, 3, 4)])
shares_tot[, c(5, 6, 7)] <- shares_tot[, c(5, 6, 7)] / rowSums(shares_tot[, c(5, 6, 7)]) We give the highest flows for returns and for the transits:
our_estimates %>%
filter(type == "tot", year == "2020-2024") %>%
arrange(-hat_transport_open_returns) %>%
head(3)## origin dest year type hat_transport_open_outwards
## 1 TUR SYR 2020-2024 tot 0.00
## 2 RUS KAZ 2020-2024 tot 38605.29
## 3 USA MEX 2020-2024 tot 103576.15
## hat_transport_open_returns hat_transport_open_transits
## 1 610832.6 4079.45
## 2 319408.1 41495.13
## 3 233627.7 55355.22
## hat_transport_closed_outwards hat_transport_closed_returns
## 1 0.00 1063293.56
## 2 39291.51 77611.23
## 3 86206.56 216240.78
## hat_transport_closed_transits
## 1 16212.90
## 2 30174.92
## 3 65734.22
our_estimates %>%
filter(type == "tot", year == "2020-2024") %>%
arrange(-hat_transport_closed_returns) %>%
head(3)## origin dest year type hat_transport_open_outwards
## 1 TUR SYR 2020-2024 tot 0.00
## 2 PAK AFG 2020-2024 tot 17206.95
## 3 PAK IND 2020-2024 tot 63107.83
## hat_transport_open_returns hat_transport_open_transits
## 1 610832.56 4079.4501
## 2 91825.75 0.0000
## 3 89941.52 133.2665
## hat_transport_closed_outwards hat_transport_closed_returns
## 1 0.00 1063293.6
## 2 27954.33 575841.7
## 3 337717.86 525836.5
## hat_transport_closed_transits
## 1 16212.8987
## 2 0.0000
## 3 794.6368
our_estimates %>%
filter(type == "tot", year == "2020-2024") %>%
arrange(-hat_transport_open_transits) %>%
head(3)## origin dest year type hat_transport_open_outwards
## 1 USA ESP 2020-2024 tot 18919.34
## 2 USA CAN 2020-2024 tot 22336.27
## 3 CAN USA 2020-2024 tot 103754.71
## hat_transport_open_returns hat_transport_open_transits
## 1 13062.94 120391.28
## 2 69994.91 88692.07
## 3 20746.12 88238.27
## hat_transport_closed_outwards hat_transport_closed_returns
## 1 15018.80 12359.41
## 2 22702.47 69521.72
## 3 85980.79 19600.31
## hat_transport_closed_transits
## 1 110984.3
## 2 105334.0
## 3 104070.4
our_estimates %>%
filter(type == "tot", year == "2020-2024") %>%
arrange(-hat_transport_closed_transits) %>%
head(3)## origin dest year type hat_transport_open_outwards
## 1 PAK IRN 2020-2024 tot 1982.297
## 2 TUR JOR 2020-2024 tot 0.000
## 3 USA ESP 2020-2024 tot 18919.339
## hat_transport_open_returns hat_transport_open_transits
## 1 177.4273 32376.84
## 2 1533.5390 48352.54
## 3 13062.9380 120391.28
## hat_transport_closed_outwards hat_transport_closed_returns
## 1 9831.122 1207.150
## 2 0.000 1596.761
## 3 15018.799 12359.411
## hat_transport_closed_transits
## 1 239427.4
## 2 135084.8
## 3 110984.3
We plot the boxplot of the various migration flow estimates (log scale), grouped by type of stock table used (raw, open, closed) (figure 14 in the article):
our_estimates_long %>%
mutate(Method = substr(Method, 5, nchar(Method))) %>%
mutate(Method = gsub(pattern = "mig_", replacement = "mig. ", Method),
Method = gsub(pattern = "pb", replacement = "PB_closed", Method),
Method = gsub(pattern = "min", replacement = "IPFP", Method),
Method = gsub(pattern = "transport", replacement = "OT", Method)) %>%
separate(col = Method, into = c("Method", "stock"), sep = "_") %>%
mutate(stock = ifelse(is.na(stock), "raw", stock)) %>%
mutate(stock = factor(stock, levels = c("raw", "open", "closed")),
Method = factor(Method,
levels = c("drop", "reverse", "mig. rate", "IPFP", "PB", "OT"))) %>%
ggplot(aes(x = Method, y = 1 + Flows)) +
geom_boxplot() +
ylab("Estimated Flows (in log scale)") +
xlab("Method") +
scale_y_log10() +
theme_minimal() +
theme(axis.text.x=element_text(angle = 90)) +
theme(aspect.ratio = 1) +
facet_grid( ~ stock, scales = "free")
ggsave("figures/box_estimates.jpeg", width = 8, height = 3.5)6.1.1 Collection of Migration Statistics
In this subsection, we compile migration data from various publicly available sources in order to compare our estimation results with existing datasets. This corresponds to Section 6.3 of the article.
- DEMIG C2C data (Source: https://www.migrationinstitute.org/publications/wp-88-14)
names_est <- c("hat_drop", "hat_reverse", "hat_mig_rate",
"hat_drop_open", "hat_reverse_open", "hat_mig_rate_open", "hat_min_open",
"hat_transport_open",
"hat_drop_closed", "hat_reverse_closed", "hat_mig_rate_closed", "hat_min_closed",
"hat_transport_closed", "hat_pb", "Y")
# to keep the same evaluation points than ABel and Cohen
country_validation <- read.csv("data/country_period_validation.csv")
# the data
my_path <- "data/demig-c2c-migration-flow-database-version-1-2_limited-online-edition.xlsx"
# the population data
temp_pop <- demo %>%
mutate(iso3 = countrycode::countrycode(Location,
origin = "country.name", "iso3c")) %>%
select(iso3, period, tot_pop_t0) %>%
rename(pop = 3)
# Demig C2C
countries_of_interset_demig <- c("AUT", "BEL", "CZE", "DNK", "FIN", "DEU",
"ISL", "NZL", "POL", "SVN", "ESP", "SWE", "SVK", "ZAF", "GBR")
#####
#### Outflows
demig_outflow_read <- readxl::read_xlsx(path = my_path, sheet = 3)
demig_outflow <- demig_outflow_read %>%
filter(Criterion == "COR",
Year %in% (1990:2024),
Gender == "Total") %>%
filter(`Coverage -Citizens/Foreigners/Both-` == "Both") %>%
mutate(weight = ifelse(Year %in% c(1990, 1995, 2000, 2005, 2010, 2015), 0.5, 1))
# aggregate by period of five years
demig_outflow_agg <- data.frame(origin = character(), dest = character(),
period = character(), Y = numeric())
for(p in 1990:2015) {
demig_outflow_agg <- rbind(demig_outflow_agg,
demig_outflow %>%
filter(Year %in% p:(p+5)) %>%
group_by(`Reporting country`, `Country codes -UN based-`) %>%
summarise(Y = sum(Value * weight),
weight = sum(weight)) %>%
mutate(Y = 5 * Y / weight,
period = paste0(p, "-", p+5)) %>%
rename(origin = 1, dest = 2)
)
}
demig_outflow_agg$origin <- countrycode(demig_outflow_agg$origin,
"country.name", "iso3c")
demig_outflow_agg <- demig_outflow_agg %>%
filter(!is.na(dest)) %>%
filter(!is.na(origin)) %>%
filter(!is.na(Y)) %>%
select(-weight)
# merge with the estimates
demig_outflow_agg <- merge(all_estimates[all_estimates$type == "tot", ],
demig_outflow_agg[, c("origin", "dest", "period", "Y")],
by.x = c("origin", "dest", "year"),
by.y = c("origin", "dest", "period"))
# compute total outflows
demig_outflow_agg_prop <- demig_outflow_agg %>%
group_by(origin, year) %>%
summarise(across(where(is.numeric), sum, na.rm = TRUE))
colnames(demig_outflow_agg_prop)[-c(1, 2)] <-
paste0(colnames(demig_outflow_agg_prop[, -c(1, 2)]), "_tot")
# we avoid to divide by 0
demig_outflow_agg_prop[demig_outflow_agg_prop == 0] <- 1
demig_outflow_agg <- merge(demig_outflow_agg, demig_outflow_agg_prop,
by = c("origin", "year"))
demig_outflow_agg[, paste0(names_est, "_tot")] <- demig_outflow_agg[, names_est] /
demig_outflow_agg[, paste0(names_est, "_tot")]
############
#### Inflows
demig_inflow_read <- readxl::read_xlsx(path = my_path, sheet = 2)
demig_inflow <- demig_inflow_read %>%
filter(Criterion == "COR",
Year %in% (1990:2024),
Gender == "Total") %>%
filter(`Coverage -Citizens/Foreigners/Both-` == "Both") %>%
mutate(weight = ifelse(Year %in% c(1990, 1995, 2000, 2005, 2010, 2015), 0.5, 1))
# aggregate by period of five years
demig_inflow_agg <- data.frame(origin = character(), dest = character(),
period = character(), Y = numeric())
for(p in 1990:2015) {
demig_inflow_agg <- rbind(demig_inflow_agg,
demig_inflow %>%
filter(Year %in% p:(p+5)) %>%
group_by(`Reporting country`, `Country codes -UN based-`) %>%
summarise(Y = sum(Value * weight),
weight = sum(weight)) %>%
mutate(Y = 5 * Y / weight,
period = paste0(p, "-", p+5)) %>%
rename(origin = 2, dest = 1)
)
}
demig_inflow_agg$dest <- countrycode(demig_inflow_agg$dest,
"country.name", "iso3c")
demig_inflow_agg <- demig_inflow_agg %>%
filter(!is.na(dest)) %>%
filter(!is.na(origin)) %>%
filter(!is.na(Y)) %>%
select(-weight)
# merge with the estimates
demig_inflow_agg <- merge(all_estimates[all_estimates$type == "tot", ],
demig_inflow_agg[, c("origin", "dest", "period", "Y")],
by.x = c("origin", "dest", "year"),
by.y = c("origin", "dest", "period"))
# compute total inflows
demig_inflow_agg_prop <- demig_inflow_agg %>%
group_by(dest, year) %>%
summarise(across(where(is.numeric), sum, na.rm = TRUE))
colnames(demig_inflow_agg_prop)[-c(1, 2)] <-
paste0(colnames(demig_inflow_agg_prop[, -c(1, 2)]), "_tot")
demig_inflow_agg_prop[demig_inflow_agg_prop == 0] <- 1
demig_inflow_agg <- merge(demig_inflow_agg, demig_inflow_agg_prop,
by = c("dest", "year"))
demig_inflow_agg[, paste0(names_est, "_tot")] <- demig_inflow_agg[, names_est] /
demig_inflow_agg[, paste0(names_est, "_tot")]
# Combine
compare_mig <- rbind(demig_inflow_agg, demig_outflow_agg)
# emigration / immigration
# emigration
compare_emig_2 <- merge(demig_outflow_agg_prop, temp_pop, by.x = c("origin", "year"),
by.y = c("iso3", "period"))
compare_emig_2[, names_est] <- compare_emig_2[, paste0(names_est, "_tot")] /
compare_emig_2[, "pop"]
# we only keep the 14 countries
compare_emig_2 <- compare_emig_2 %>% filter(origin %in% countries_of_interset_demig)
# immigration
compare_imig_2 <- merge(demig_inflow_agg_prop, temp_pop, by.x = c("dest", "year"),
by.y = c("iso3", "period"))
compare_imig_2[, names_est] <- compare_imig_2[, paste0(names_est , "_tot")] /
compare_imig_2[, "pop"]
# we only keep the 14 countries
compare_imig_2 <- compare_imig_2 %>% filter(dest %in% countries_of_interset_demig)
# net
net_mig <- merge(demig_inflow_agg_prop, demig_outflow_agg_prop,
by.x = c("dest", "year"),
by.y = c("origin", "year"))
net_mig[, names_est] <- net_mig[, paste0(names_est, "_tot.x")] -
net_mig[, paste0(names_est, "_tot.y")]
# correlation
mydata_new_demig_1 <- data.frame(
mig_measure = factor(c(rep("count", 14), rep("log", 14), rep("prop", 14),
rep("emig", 14), rep("immig", 14), rep("net", 14)),
levels = c("count", "log", "prop", "emig", "immig", "net")),
adjusted = factor(c(rep("raw", 3), rep("open", 5), rep("closed", 6)),
levels = c("raw", "open", "closed")),
method = factor(c("drop", "reverse", "mig. rate",
"drop", "reverse", "mig. rate", "mini.", "OT",
"drop", "reverse", "mig. rate", "mini.", "OT", "PB"),
levels = c("drop", "reverse", "mig. rate", "mini.", "OT", "PB")),
values = c(
as.numeric(cor(compare_mig[, "Y"],
compare_mig[, names_est[names_est != "Y"]])),
as.numeric(cor(log(1 + compare_mig[, "Y"]),
log(1 + compare_mig[, names_est[names_est != "Y"]])) ),
as.numeric(cor(compare_mig[, "Y_tot"],
compare_mig[, paste0(names_est[names_est != "Y"], "_tot")])),
as.numeric(cor(compare_emig_2[, "Y"],
compare_emig_2[, names_est[names_est != "Y"]])),
as.numeric(cor(compare_imig_2[, "Y"],
compare_imig_2[, names_est[names_est != "Y"]])),
as.numeric(cor(net_mig[, "Y"],
net_mig[, names_est[names_est != "Y"]]))
)
)
mydata_new_demig_1$data <- "DEMIG C2C"- DEMIG (Source: https://www.migrationinstitute.org/publications/wp-88-14)
# Demig Total outflows
demig_total <- read.csv2("data/Total Outflows-Tableau 1.csv")
demig_total[, 11:ncol(demig_total)] <- sapply(demig_total[, 11:ncol(demig_total)],
as.numeric)
emig <- demig_total %>%
filter(Coverage == "Total") %>%
select(5, 11:ncol(demig_total)) %>%
rename(Country = 1) # %>% filter(Country %in% countries_of_interset_demig)
emig <- emig %>%
pivot_longer(cols = 2:ncol(emig),
names_to = "year", values_to = "Y") %>%
mutate(year = as.numeric(substr(year, 2, 5))) %>%
mutate(weight = ifelse(year %in% c(1990, 1995, 2000, 2005, 2010, 2015, 2020), 0.5, 1)) %>%
filter(!is.na(Y))
emig_agg <- data.frame(country = character(), period = character(), Y = numeric())
for(p in 1990:2015) {
emig_agg <- rbind(emig_agg,
emig %>%
filter(year %in% p:(p+5)) %>%
group_by(Country) %>%
summarise(Y = sum(Y * weight),
weight = sum(weight)) %>%
mutate(Y = 5 * Y / weight,
period = paste0(p, "-", p+5)) %>%
filter(weight > 4.5)
)
}
# Note : Here, we include additional countries beyond those used in Abel and Cohen to increase the number of observations. To ensure data reliability, we require that each country has at least 5 observations within a given period.
### merge with the estimates
emig_agg <- merge(emig_agg,
all_estimates[all_estimates$type == "tot", ] %>%
group_by(origin, year) %>%
summarise(across(where(is.numeric), sum, na.rm = TRUE)),
by.x = c('Country', 'period'), by.y = c("origin", "year"))
# Demig Total inflows
demig_total <- read.csv2("data/Total Inflows-Tableau 1.csv")
demig_total[, 11:ncol(demig_total)] <- sapply(demig_total[, 11:ncol(demig_total)],
as.numeric)
immig <- demig_total %>%
filter(Coverage == "Total") %>%
select(5, 11:ncol(demig_total)) %>%
rename(Country = 1) # %>% filter(Country %in% countries_of_interset_demig)
immig <- immig %>%
pivot_longer(cols = 2:ncol(immig),
names_to = "year", values_to = "Y") %>%
mutate(year = as.numeric(substr(year, 2, 5))) %>%
mutate(weight = ifelse(year %in% c(1990, 1995, 2000, 2005, 2010, 2015, 2020), 0.5, 1)) %>%
filter(!is.na(Y))
immig_agg <- data.frame(country = character(), period = character(), Y = numeric())
for(p in 1990:2015) {
immig_agg <- rbind(immig_agg,
immig %>%
filter(year %in% p:(p+5)) %>%
group_by(Country) %>%
summarise(Y = sum(Y * weight),
weight = sum(weight)) %>%
mutate(Y = 5 * Y / weight,
period = paste0(p, "-", p+5)) %>%
filter(weight > 4.5)
)
}
### merge with the estimates
immig_agg <- merge(immig_agg,
all_estimates[all_estimates$type == "tot", ] %>%
group_by(dest, year) %>%
summarise(across(where(is.numeric), sum, na.rm = TRUE)),
by.x = c('Country', 'period'), by.y = c("dest", "year"))
# net aggr
net_agg_temp <- merge(immig_agg, emig_agg, by = c("Country", "period"))
net_agg <- net_agg_temp[, c(1, 2)]
net_agg[, names_est] <- net_agg_temp[, paste0(names_est, ".x")] -
net_agg_temp[, paste0(names_est, ".y")]
# merge with pop
immig_agg <- merge(immig_agg, temp_pop,
by.x = c("Country", "period"), by.y = c("iso3", "period"))
immig_agg[, names_est] <- immig_agg[, names_est] / immig_agg[, "pop"]
emig_agg <- merge(emig_agg, temp_pop,
by.x = c("Country", "period"), by.y = c("iso3", "period"))
emig_agg[, names_est] <- emig_agg[, names_est] / emig_agg[, "pop"]
#################################
# plot
# correlation
mydata_new_demig_2 <- data.frame(
mig_measure = factor(c(rep("emig", 14), rep("immig", 14), rep("net", 14)),
levels = c("emig", "immig", "net")),
adjusted = factor(c(rep("raw", 3), rep("open", 5), rep("closed", 6)),
levels = c("raw", "open", "closed")),
method = factor(c("drop", "reverse", "mig. rate",
"drop", "reverse", "mig. rate", "mini.", "OT",
"drop", "reverse", "mig. rate", "mini.", "OT", "PB"),
levels = c("drop", "reverse", "mig. rate", "mini.", "OT", "PB")),
values = c(
as.numeric(cor(emig_agg[, "Y"],
emig_agg[, names_est[names_est != "Y"]])),
as.numeric(cor(immig_agg[, "Y"],
immig_agg[, names_est[names_est != "Y"]])),
as.numeric(cor(net_agg[, "Y"],
net_agg[, names_est[names_est != "Y"]]))
)
)
mydata_new_demig_2$data <- "DEMIG TOTAL"- United Nations data basis (Source : https://www.kaggle.com/datasets/databeginner/un-migrationflow-2015):
name_files <- dir("data/archive-2/")
countries_of_interset_un <- countrycode(
substr(name_files, 1, nchar(name_files)-5), origin = "country.name", "iso3c")
un_flow_agg <- data.frame(origin = character(),
dest = character(),
period = character(),
Y = numeric())
for(k in 1:length(name_files)) {
nom_country <- substr(name_files[k], 1, nchar(name_files[k])-5)
my_path <- paste0("data/archive-2/", name_files[k])
un_flow <- readxl::read_xlsx(path = my_path, sheet = 2, skip = 20)
un_flow$origin <- un_flow$OdName
un_flow$dest <- un_flow$OdName
un_flow[un_flow$Type == "Emigrants", "origin"] <- nom_country
un_flow[un_flow$Type == "Immigrants", "dest"] <- nom_country
un_flow_long <- un_flow %>%
select(-c(2:9))
un_flow_long[, 2:(ncol(un_flow_long) - 2)] <- sapply(
un_flow_long[, 2:(ncol(un_flow_long) - 2)], as.numeric)
un_flow_long <- pivot_longer(un_flow_long, cols = 2:(ncol(un_flow_long) - 2),
names_to = "year", values_to = "Y")
un_flow_long$Y <- as.numeric(un_flow_long$Y)
# drop NA
un_flow_long <- un_flow_long[!is.na(un_flow_long$Y), ]
un_flow_long$year <- as.numeric(un_flow_long$year)
# aggregate by period
un_flow_long$weight <-
ifelse(un_flow_long$year %in% c(1990, 1995, 2000, 2005, 2010, 2015, 2020), 0.5, 1)
for(p in 1990:2015) {
un_flow_agg <- rbind(un_flow_agg,
un_flow_long %>%
filter(year %in% p:(p+5)) %>%
group_by(origin, dest, Type) %>%
summarise(Y = sum(Y * weight),
weight = sum(weight)) %>%
mutate(Y = 5 * Y / weight) %>%
mutate(period = paste0(p, "-", p+5))
)
}
}
un_flow_agg$origin <- countrycode::countrycode(un_flow_agg$origin,
origin = "country.name", "iso3c")
un_flow_agg$dest <- countrycode::countrycode(un_flow_agg$dest,
origin = "country.name", "iso3c")#### Merge with our estimates
compare_mig_un <- merge(all_estimates[all_estimates$type == "tot", ],
un_flow_agg[, c("origin", "dest", "period", "Y", "Type")],
by.x = c("origin", "dest", "year"),
by.y = c("origin", "dest", "period"))
compare_mig_un <- compare_mig_un[!is.na(compare_mig_un$Y), ]
# compute proportion
# emigrants
compare_mig_em <- compare_mig_un %>%
filter(Type == "Emigrants")
compare_mig_em_agg <- compare_mig_em %>%
group_by(origin, year) %>%
summarise(across(where(is.numeric), sum, na.rm = TRUE), n = n())
colnames(compare_mig_em_agg)[-c(1, 2)] <-
paste0(colnames(compare_mig_em_agg[, -c(1, 2)]), "_tot")
# we avoid to divide by 0
compare_mig_em_agg[compare_mig_em_agg == 0] <- 1
compare_mig_em <- merge(compare_mig_em, compare_mig_em_agg,
by = c("origin", "year"))
compare_mig_em[, paste0(names_est, "_prop")] <- compare_mig_em[, names_est] /
compare_mig_em[, paste0(names_est, "_tot")]
# Immigrants
compare_mig_im <- compare_mig_un %>%
filter(Type == "Immigrants")
compare_mig_im_agg <- compare_mig_im %>%
group_by(dest, year) %>%
summarise(across(where(is.numeric), sum, na.rm = TRUE), n = n())
colnames(compare_mig_im_agg)[-c(1, 2)] <-
paste0(colnames(compare_mig_im_agg[, -c(1, 2)]), "_tot")
# we avoid to divide by 0
compare_mig_im_agg[compare_mig_im_agg == 0] <- 1
compare_mig_im <- merge(compare_mig_im, compare_mig_im_agg,
by = c("dest", "year"))
compare_mig_im[, paste0(names_est, "_prop")] <- compare_mig_im[, names_est] /
compare_mig_im[, paste0(names_est, "_tot")]
# combine
compare_mig_un <- rbind(compare_mig_im, compare_mig_em)
# emigration / immigration
# emigration
compare_emig_2 <- merge(compare_mig_em_agg, temp_pop,
by.x = c("origin", "year"), by.y = c("iso3", "period"))
compare_emig_2[, names_est] <- compare_emig_2[, paste0(names_est, "_tot")] /
compare_emig_2[, "pop"]
# immigration
compare_imig_2 <- merge(compare_mig_im_agg, temp_pop,
by.x = c("dest", "year"), by.y = c("iso3", "period"))
compare_imig_2[, names_est] <- compare_imig_2[, paste0(names_est, "_tot")] /
compare_imig_2[, "pop"]
# net
net_agg_temp <- merge(compare_imig_2, compare_emig_2,
by.x = c("dest", "year"), by.y = c("origin", "year"))
net_mig <- net_agg_temp[, c(1, 2)]
net_mig[, names_est] <- net_agg_temp[, paste0(names_est, "_tot.x")] -
net_agg_temp[, paste0(names_est, "_tot.y")]
# correlation
mydata_un <- data.frame(
mig_measure = factor(c(rep("count", 14), rep("log", 14), rep("prop", 14),
rep("emig", 14), rep("immig", 14), rep("net", 14)),
levels = c("count", "log", "prop", "emig", "immig", "net")),
adjusted = factor(c(rep("raw", 3), rep("open", 5), rep("closed", 6)),
levels = c("raw", "open", "closed")),
method = factor(c("drop", "reverse", "mig. rate",
"drop", "reverse", "mig. rate", "mini.", "OT",
"drop", "reverse", "mig. rate", "mini.", "OT", "PB"),
levels = c("drop", "reverse", "mig. rate", "mini.", "OT", "PB")),
values = c(
as.numeric(cor(compare_mig_un[, "Y"],
compare_mig_un[, names_est[names_est != "Y"]])),
as.numeric(cor(log(1 + compare_mig_un[, "Y"]),
log(1 + compare_mig_un[, names_est[names_est != "Y"]])) ),
as.numeric(cor(compare_mig_un[, "Y_prop"],
compare_mig_un[, paste0(names_est[names_est != "Y"], "_prop")])),
as.numeric(cor(compare_emig_2[, "Y"],
compare_emig_2[, names_est[names_est != "Y"]])),
as.numeric(cor(compare_imig_2[, "Y"],
compare_imig_2[, names_est[names_est != "Y"]])),
as.numeric(cor(net_mig[, "Y"],
net_mig[, names_est[names_est != "Y"]]))
)
)
mydata_un$data <- "UN DESA"- WPP net (source : https://population.un.org/wpp/)
net_agg_wpp <- data.frame(country = character(), period = character(),
Y = numeric())
wpp2024_temp <- wpp2024 %>%
filter(!is.na(`Location code`), !is.na(Year),
!is.na(`Net Number of Migrants (thousands)`)) %>%
mutate(Year = as.numeric(Year))
for(p in 1990:2015) {
net_agg_wpp <- rbind(net_agg_wpp,
wpp2024_temp %>%
select(Year, `Location code`, `Net Number of Migrants (thousands)`) %>%
rename(year = 1, Country = 2, Y = 3) %>%
mutate(Y = as.numeric(Y)) %>%
mutate(weight = ifelse(year %in% c(1990, 1995, 2000, 2005, 2010, 2015, 2020), 0.5, 1)) %>%
filter(year %in% p:(p+5)) %>%
group_by(Country) %>%
summarise(Y = sum(Y * weight),
weight = sum(weight)) %>%
mutate(Y = 5 * Y / weight) %>%
mutate(period = paste0(p, "-", p+5))
)
}
net_agg_wpp$Country <- countrycode::countrycode(net_agg_wpp$Country,
origin = "un", "iso3c")
net_agg_wpp <- net_agg_wpp[!is.na(net_agg_wpp$Country), ]
# emigration
compare_emig_all <- all_estimates %>%
group_by(origin, year) %>%
summarise(across(where(is.numeric), sum, na.rm = TRUE))
# immigration
compare_imig_all <- all_estimates %>%
group_by(dest, year) %>%
summarise(across(where(is.numeric), sum, na.rm = TRUE))
# net
net <- compare_imig_all
net[, -c(1, 2)] <- compare_imig_all[, -c(1, 2)] - compare_emig_all[, -c(1, 2)]
# net
net_mig <- merge(net, net_agg_wpp, by.x = c("dest", "year"),
by.y = c("Country", "period"))
# correlation
mydata_un_tot <- data.frame(
mig_measure = factor(c(rep("net", 14)),
levels = c("count", "log", "prop", "emig", "immig", "net")),
adjusted = factor(c(rep("raw", 3), rep("open", 5), rep("closed", 6)),
levels = c("raw", "open", "closed")),
method = factor(c("drop", "reverse", "mig. rate",
"drop", "reverse", "mig. rate", "mini.", "OT",
"drop", "reverse", "mig. rate", "mini.", "OT", "PB"),
levels = c("drop", "reverse", "mig. rate", "mini.", "OT", "PB")),
values = c(
as.numeric(cor(net_mig[, "Y"],
net_mig[, names_est[names_est != "Y"]]))
)
)
mydata_un_tot$data <- "UN WPP"We plot the correlations between estimated migration flows during five-year periods from 1990–1995 to 2015–2020 from the different estimation methods with constructed five-year equivalent reported migration flows from four collections of migration statistics (figure 4 in the article).
mydata_new <- rbind(mydata_new_demig_1, mydata_new_demig_2, mydata_un, mydata_un_tot)
theme_heat <- theme_classic() +
theme(axis.line = element_blank(),
axis.ticks = element_blank())
# basic plot
levels(mydata_new$method)[4] <- "IPFP"
mydata_new$method <- factor(mydata_new$method,
levels = c("drop", "reverse", "mig. rate", "IPFP", "PB", "OT"))
plot <- ggplot(mydata_new, aes(x = mig_measure, y = method)) +
geom_tile(aes(fill = values), color = "white") +
theme(aspect.ratio = 1) +
facet_grid(vars(adjusted), vars(data), scales = "free", space = "free") + theme_heat
# plot with text overlay and viridis color palette
plot + geom_text(aes(label = round(values, 2)),
color = "white") +
scale_fill_gradientn(colors = rev(hcl.colors(20, "RdYlGn"))) +
# scale_fill_viridis(option = "viridis", limits=c(-1, 1), direction = -1) +
# formatting
ggtitle("", subtitle = "") +
labs(caption = "") +
xlab("Migration Measure") +
ylab("Estimation Method") +
theme(plot.title = element_text(face = "bold")) +
theme(plot.subtitle = element_text(face = "bold", color = "grey35")) +
theme(plot.caption = element_text(color = "grey68"))7 Vizualisation
We aim to reproduce the flow map displayed in Figure 5 of the article. As a first step, we aggregate individual countries into broader regions to simplify the visualisation.
# list of regions
levels_S <- c(
"N.Europe", "W.Europe", "E.Europe", "S.Europe",
"N.Africa", "S.-S.Africa",
"W.Asia", "S.Asia", "S.-E.Asia",
"C.Asia", "E.Asia",
"Aust. & New Z.", "Pacific Isl.",
"N.America", "L.America & Carib.")
labels_S <- c(
"Northern Europe", "Western Europe", "Eastern Europe", "Southern Europe",
"Northern Africa", "Sub-Saharan Africa",
"Western Asia", "Southern Asia", "South-Eastern Asia",
"Central Asia", "Eastern Asia",
"Australia and New Zealand", "Pacific Islands",
"Northern America", "Latin America and the Caribbean")
# prepare the table
tab_iso_reg <- st_drop_geometry(GeoDATA)[, c("ISO3","SREGION" )]
tab_iso_reg$SREGION <- gsub("Micronesia", "Pacific Islands", tab_iso_reg$SREGION)
tab_iso_reg$SREGION <- gsub("Polynesia", "Pacific Islands", tab_iso_reg$SREGION)
tab_iso_reg$SREGION <- gsub("Melanesia", "Pacific Islands", tab_iso_reg$SREGION)
tab_iso_reg$SREGION <- factor(tab_iso_reg$SREGION,
levels = labels_S , labels = levels_S)
q4 <- qualitative_hcl(length(levels_S), palette = "Dark 3")
names(q4) <- levels_S
our_estimates_reg <- all_estimates
our_estimates_reg <- merge(our_estimates_reg, tab_iso_reg,
by.x = "origin", by.y = "ISO3")
our_estimates_reg <- merge(our_estimates_reg, tab_iso_reg,
by.x = "dest", by.y = "ISO3")
our_estimates_reg <- aggregate(our_estimates_reg[,
!(names(our_estimates_reg) %in% c("dest", "origin", "year",
"type", "SREGION.x", "SREGION.y"))],
by = list(origin = our_estimates_reg$SREGION.x,
dest = our_estimates_reg$SREGION.y,
year = our_estimates_reg$year,
type = our_estimates_reg$type),
FUN = sum)
## prepare the spatial data
GeoDATA$SREGION <- gsub("Micronesia", "Pacific Islands", GeoDATA$SREGION)
GeoDATA$SREGION <- gsub("Polynesia", "Pacific Islands", GeoDATA$SREGION)
GeoDATA$SREGION <- gsub("Melanesia", "Pacific Islands", GeoDATA$SREGION)
countries_regions <- GeoDATA %>%
rmapshaper::ms_simplify(, keep = 0.05, keep_shapes = T)
contours_map <- aggregate(countries_regions[, 1],
by = list(S = countries_regions$SREGION),
FUN = length)
contours_map <- st_transform(contours_map, "+proj=moll")
contours_map$S <- factor(contours_map$S, levels = labels_S , labels = levels_S)
xy_sf <- st_centroid(contours_map)## Warning: st_centroid assumes attributes are constant over geometries
We plot the estimated migration flows between 15 geographic regions for 2020–2024 (top) and 2015–2020 (bottom), using two methods: Optimal Transport with Open Stocks (left) and Closed Stock-Adjusted Tables (right). In each regional panel, the left bar indicates total outgoing flows, the right bar total incoming flows, and the bottom bar intra-regional flows (i.e., movements within the same region). The widths of the bars and flow lines are proportional to the estimated flow volumes. Flow directions are indicated by arrows. A distinct color is assigned to each region for clarity (figure 5 in the article).
We print the Top 5 flows:
our_estimates_reg %>%
filter(type == "tot", year == "2020-2024") %>%
arrange(-hat_transport_open) %>%
select(origin, dest, hat_transport_open)## origin dest hat_transport_open
## 1 S.-S.Africa S.-S.Africa 5.384988e+06
## 2 L.America & Carib. L.America & Carib. 4.889411e+06
## 3 E.Europe E.Europe 4.745321e+06
## 4 L.America & Carib. N.America 4.179611e+06
## 5 W.Asia W.Asia 4.151850e+06
## 6 S.Asia W.Asia 3.510625e+06
## 7 E.Europe W.Europe 3.106401e+06
## 8 S.Asia S.Asia 2.884433e+06
## 9 N.America L.America & Carib. 1.660064e+06
## 10 L.America & Carib. S.Europe 1.637726e+06
## 11 S.-E.Asia S.-E.Asia 1.617606e+06
## 12 S.Asia N.America 1.575409e+06
## 13 E.Europe N.Europe 1.341954e+06
## 14 W.Asia S.Asia 1.214883e+06
## 15 S.-S.Africa N.Africa 1.208336e+06
## 16 E.Asia E.Asia 1.187709e+06
## 17 N.Africa S.-S.Africa 1.186504e+06
## 18 S.Europe W.Europe 1.147237e+06
## 19 E.Asia N.America 1.065879e+06
## 20 W.Asia W.Europe 1.060285e+06
## 21 N.Africa W.Asia 1.036101e+06
## 22 S.-E.Asia E.Asia 1.028880e+06
## 23 S.-E.Asia N.America 9.471597e+05
## 24 W.Europe W.Europe 9.114677e+05
## 25 S.-E.Asia S.Asia 8.735426e+05
## 26 E.Europe C.Asia 8.514212e+05
## 27 S.Europe S.Europe 7.843656e+05
## 28 E.Europe S.Europe 7.843128e+05
## 29 W.Europe S.Europe 7.789228e+05
## 30 W.Europe E.Europe 7.063854e+05
## 31 S.Asia N.Europe 6.921713e+05
## 32 E.Europe W.Asia 6.915851e+05
## 33 S.-S.Africa N.America 6.627748e+05
## 34 C.Asia E.Europe 6.584929e+05
## 35 S.Asia W.Europe 6.484828e+05
## 36 E.Asia S.-E.Asia 6.227857e+05
## 37 S.Europe L.America & Carib. 5.931691e+05
## 38 N.Africa N.Africa 5.792274e+05
## 39 N.Africa W.Europe 5.589046e+05
## 40 W.Asia N.Africa 5.575032e+05
## 41 S.-S.Africa W.Europe 5.504430e+05
## 42 N.America E.Asia 5.329746e+05
## 43 S.Asia Aust. & New Z. 5.322479e+05
## 44 W.Europe W.Asia 5.231031e+05
## 45 W.Asia N.America 4.715824e+05
## 46 S.-E.Asia W.Asia 4.686545e+05
## 47 N.Africa S.Europe 4.353119e+05
## 48 N.Europe E.Europe 4.316829e+05
## 49 N.America S.Asia 4.283272e+05
## 50 N.Europe N.Europe 4.242456e+05
## 51 N.America S.-E.Asia 4.147934e+05
## 52 S.Asia S.-E.Asia 4.100378e+05
## 53 S.Europe E.Europe 4.006911e+05
## 54 N.America N.America 3.956955e+05
## 55 S.Europe N.Europe 3.833559e+05
## 56 N.America S.Europe 3.357171e+05
## 57 S.-S.Africa S.Europe 3.354415e+05
## 58 W.Europe N.America 3.286733e+05
## 59 E.Europe N.America 3.270411e+05
## 60 S.-S.Africa N.Europe 3.245381e+05
## 61 N.America W.Europe 3.200002e+05
## 62 W.Asia E.Europe 3.175194e+05
## 63 S.-E.Asia Aust. & New Z. 3.101774e+05
## 64 N.America N.Europe 2.923538e+05
## 65 W.Asia N.Europe 2.824559e+05
## 66 N.America W.Asia 2.782782e+05
## 67 W.Europe N.Europe 2.769579e+05
## 68 W.Europe N.Africa 2.764979e+05
## 69 S.Europe N.America 2.749456e+05
## 70 N.Europe N.America 2.720795e+05
## 71 S.Asia S.Europe 2.704205e+05
## 72 C.Asia C.Asia 2.462192e+05
## 73 S.Asia E.Asia 2.460123e+05
## 74 N.Europe S.Asia 2.430851e+05
## 75 E.Asia Aust. & New Z. 2.399519e+05
## 76 W.Europe S.-S.Africa 2.386226e+05
## 77 W.Asia S.-E.Asia 2.308891e+05
## 78 N.Europe S.Europe 2.242427e+05
## 79 L.America & Carib. W.Europe 2.186931e+05
## 80 N.Europe W.Europe 2.145278e+05
## 81 N.America E.Europe 2.119422e+05
## 82 S.-S.Africa W.Asia 2.031771e+05
## 83 N.America S.-S.Africa 1.887820e+05
## 84 N.Europe W.Asia 1.780253e+05
## 85 Aust. & New Z. Aust. & New Z. 1.762060e+05
## 86 Aust. & New Z. N.Europe 1.675260e+05
## 87 E.Asia S.Asia 1.644161e+05
## 88 W.Europe L.America & Carib. 1.572570e+05
## 89 W.Asia S.-S.Africa 1.496993e+05
## 90 N.Europe Aust. & New Z. 1.489464e+05
## 91 N.Africa N.America 1.464527e+05
## 92 N.Europe S.-S.Africa 1.456657e+05
## 93 S.Europe N.Africa 1.429322e+05
## 94 W.Asia S.Europe 1.428820e+05
## 95 S.Europe S.-S.Africa 1.388033e+05
## 96 Aust. & New Z. S.Asia 1.362197e+05
## 97 E.Asia N.Europe 1.327047e+05
## 98 C.Asia W.Asia 1.326941e+05
## 99 Aust. & New Z. S.-E.Asia 1.322907e+05
## 100 W.Europe S.Asia 1.320542e+05
## 101 Aust. & New Z. E.Asia 1.293559e+05
## 102 S.-E.Asia N.Europe 1.269636e+05
## 103 E.Asia W.Europe 1.211520e+05
## 104 S.-E.Asia W.Europe 1.187812e+05
## 105 S.-S.Africa Aust. & New Z. 1.145146e+05
## 106 E.Asia L.America & Carib. 1.137026e+05
## 107 W.Asia Aust. & New Z. 1.092101e+05
## 108 C.Asia W.Europe 1.064207e+05
## 109 N.America Aust. & New Z. 1.061660e+05
## 110 N.Europe E.Asia 1.027053e+05
## 111 Aust. & New Z. N.America 1.000472e+05
## 112 S.Europe W.Asia 9.574134e+04
## 113 E.Asia S.Europe 9.359576e+04
## 114 L.America & Carib. N.Europe 8.900777e+04
## 115 Pacific Isl. Aust. & New Z. 8.745180e+04
## 116 L.America & Carib. Aust. & New Z. 8.716165e+04
## 117 Aust. & New Z. S.Europe 8.307036e+04
## 118 L.America & Carib. E.Asia 8.037154e+04
## 119 S.Europe S.Asia 7.862111e+04
## 120 N.Europe S.-E.Asia 7.599423e+04
## 121 Aust. & New Z. W.Asia 7.235577e+04
## 122 W.Europe S.-E.Asia 6.867551e+04
## 123 W.Europe E.Asia 6.568132e+04
## 124 E.Europe E.Asia 6.510409e+04
## 125 E.Asia C.Asia 6.269131e+04
## 126 W.Europe C.Asia 6.009568e+04
## 127 Aust. & New Z. W.Europe 5.970898e+04
## 128 N.America N.Africa 5.544645e+04
## 129 S.Europe E.Asia 5.302400e+04
## 130 Aust. & New Z. S.-S.Africa 5.174285e+04
## 131 N.Europe L.America & Carib. 5.065986e+04
## 132 S.-E.Asia S.Europe 4.814307e+04
## 133 W.Asia C.Asia 4.663067e+04
## 134 S.Europe Aust. & New Z. 4.560164e+04
## 135 C.Asia E.Asia 4.200813e+04
## 136 Aust. & New Z. Pacific Isl. 3.698156e+04
## 137 W.Europe Aust. & New Z. 3.656749e+04
## 138 E.Asia W.Asia 3.576682e+04
## 139 W.Asia E.Asia 3.409928e+04
## 140 S.-S.Africa S.Asia 3.373963e+04
## 141 L.America & Carib. W.Asia 3.127242e+04
## 142 Aust. & New Z. L.America & Carib. 3.126985e+04
## 143 E.Asia E.Europe 3.122529e+04
## 144 Aust. & New Z. E.Europe 3.114563e+04
## 145 S.-S.Africa E.Asia 2.970395e+04
## 146 E.Asia N.Africa 2.938473e+04
## 147 N.Africa N.Europe 2.800036e+04
## 148 S.Asia S.-S.Africa 2.796370e+04
## 149 L.America & Carib. S.-S.Africa 2.690402e+04
## 150 W.Asia L.America & Carib. 2.606560e+04
## 151 E.Europe Aust. & New Z. 2.564542e+04
## 152 S.Europe S.-E.Asia 2.453136e+04
## 153 S.Asia L.America & Carib. 2.376523e+04
## 154 S.-S.Africa L.America & Carib. 2.146731e+04
## 155 E.Asia S.-S.Africa 1.888761e+04
## 156 E.Europe L.America & Carib. 1.773742e+04
## 157 L.America & Carib. S.Asia 1.767812e+04
## 158 Pacific Isl. Pacific Isl. 1.737919e+04
## 159 S.-E.Asia E.Europe 1.701897e+04
## 160 N.Europe N.Africa 1.567569e+04
## 161 E.Europe S.-E.Asia 1.486879e+04
## 162 S.Asia E.Europe 1.486812e+04
## 163 S.-E.Asia L.America & Carib. 1.410239e+04
## 164 L.America & Carib. E.Europe 1.391161e+04
## 165 C.Asia S.Europe 1.312123e+04
## 166 N.Africa E.Europe 1.279487e+04
## 167 W.Europe Pacific Isl. 1.171330e+04
## 168 C.Asia N.America 1.099887e+04
## 169 N.Africa Aust. & New Z. 1.073568e+04
## 170 C.Asia N.Europe 1.049499e+04
## 171 S.-E.Asia Pacific Isl. 1.029719e+04
## 172 Aust. & New Z. N.Africa 8.904415e+03
## 173 N.America Pacific Isl. 8.018936e+03
## 174 S.-E.Asia N.Africa 7.827838e+03
## 175 E.Europe N.Africa 7.686108e+03
## 176 Pacific Isl. N.America 7.378765e+03
## 177 Pacific Isl. W.Europe 7.228576e+03
## 178 E.Europe S.Asia 7.071988e+03
## 179 N.America C.Asia 6.901666e+03
## 180 S.Europe C.Asia 6.195245e+03
## 181 S.Asia C.Asia 6.142153e+03
## 182 N.Europe C.Asia 6.116274e+03
## 183 Pacific Isl. S.-E.Asia 6.080855e+03
## 184 L.America & Carib. S.-E.Asia 5.778068e+03
## 185 N.Africa S.-E.Asia 5.525892e+03
## 186 S.-S.Africa E.Europe 5.297978e+03
## 187 S.Asia N.Africa 4.994083e+03
## 188 N.Africa E.Asia 4.352607e+03
## 189 C.Asia S.Asia 4.275659e+03
## 190 N.Africa L.America & Carib. 3.665812e+03
## 191 S.-E.Asia S.-S.Africa 3.642291e+03
## 192 E.Europe S.-S.Africa 3.424063e+03
## 193 E.Asia Pacific Isl. 3.287185e+03
## 194 S.-S.Africa S.-E.Asia 3.248232e+03
## 195 N.Africa S.Asia 3.177567e+03
## 196 C.Asia Aust. & New Z. 2.484058e+03
## 197 L.America & Carib. N.Africa 2.267427e+03
## 198 Pacific Isl. E.Asia 2.111584e+03
## 199 Aust. & New Z. C.Asia 1.533915e+03
## 200 Pacific Isl. S.-S.Africa 1.415070e+03
## 201 S.Asia Pacific Isl. 1.141347e+03
## 202 Pacific Isl. W.Asia 1.092094e+03
## 203 Pacific Isl. S.Europe 1.068567e+03
## 204 Pacific Isl. N.Europe 1.023920e+03
## 205 N.Europe Pacific Isl. 9.381446e+02
## 206 C.Asia N.Africa 9.149418e+02
## 207 S.Europe Pacific Isl. 8.431060e+02
## 208 C.Asia S.-E.Asia 8.166794e+02
## 209 W.Asia Pacific Isl. 8.142289e+02
## 210 S.-E.Asia C.Asia 7.363523e+02
## 211 Pacific Isl. L.America & Carib. 7.130293e+02
## 212 Pacific Isl. S.Asia 6.965870e+02
## 213 L.America & Carib. Pacific Isl. 6.666860e+02
## 214 N.Africa C.Asia 5.983857e+02
## 215 C.Asia L.America & Carib. 5.538382e+02
## 216 S.-S.Africa Pacific Isl. 5.229448e+02
## 217 C.Asia S.-S.Africa 4.461749e+02
## 218 N.Africa Pacific Isl. 4.239438e+02
## 219 L.America & Carib. C.Asia 3.992451e+02
## 220 S.-S.Africa C.Asia 3.667942e+02
## 221 Pacific Isl. N.Africa 2.680665e+02
## 222 Pacific Isl. E.Europe 1.628252e+02
## 223 E.Europe Pacific Isl. 1.078203e+02
## 224 Pacific Isl. C.Asia 8.760604e+00
## 225 C.Asia Pacific Isl. 6.390744e+00
our_estimates_reg %>%
filter(type == "tot", year == "2020-2024") %>%
arrange(-hat_transport_closed) %>%
select(origin, dest, hat_transport_closed)## origin dest hat_transport_closed
## 1 S.-S.Africa S.-S.Africa 6.314107e+06
## 2 S.Asia W.Asia 5.964250e+06
## 3 W.Asia W.Asia 5.118946e+06
## 4 S.Asia S.Asia 4.595392e+06
## 5 E.Europe E.Europe 4.479709e+06
## 6 L.America & Carib. L.America & Carib. 4.294090e+06
## 7 S.Asia N.America 3.323670e+06
## 8 L.America & Carib. N.America 3.042364e+06
## 9 E.Europe W.Europe 2.639400e+06
## 10 N.America L.America & Carib. 1.688754e+06
## 11 E.Asia N.America 1.536951e+06
## 12 N.Africa S.-S.Africa 1.505139e+06
## 13 S.Asia N.Europe 1.494741e+06
## 14 S.-E.Asia S.-E.Asia 1.402861e+06
## 15 E.Asia E.Asia 1.384241e+06
## 16 E.Europe N.Europe 1.189691e+06
## 17 S.Europe W.Europe 1.147887e+06
## 18 W.Asia S.Asia 1.128018e+06
## 19 S.Asia S.-E.Asia 1.107459e+06
## 20 N.Africa W.Asia 1.087042e+06
## 21 S.-E.Asia N.America 1.085360e+06
## 22 L.America & Carib. S.Europe 1.035952e+06
## 23 W.Europe W.Europe 8.944307e+05
## 24 S.Asia W.Europe 8.483262e+05
## 25 S.-E.Asia E.Asia 8.126334e+05
## 26 W.Europe S.Europe 8.096065e+05
## 27 S.Asia Aust. & New Z. 8.088701e+05
## 28 W.Asia W.Europe 7.990885e+05
## 29 E.Europe S.Europe 7.455122e+05
## 30 S.Europe S.Europe 7.389751e+05
## 31 W.Asia N.Africa 7.231490e+05
## 32 S.Europe L.America & Carib. 7.066793e+05
## 33 S.-S.Africa N.Africa 6.784538e+05
## 34 S.-S.Africa N.America 6.779026e+05
## 35 W.Europe E.Europe 6.637121e+05
## 36 E.Asia S.-E.Asia 6.244366e+05
## 37 N.Africa N.Africa 6.179118e+05
## 38 E.Europe C.Asia 6.116092e+05
## 39 S.Asia S.Europe 5.802087e+05
## 40 E.Europe W.Asia 5.768194e+05
## 41 S.Europe E.Europe 5.733054e+05
## 42 W.Europe W.Asia 5.345884e+05
## 43 W.Asia N.America 5.291450e+05
## 44 C.Asia E.Europe 5.176284e+05
## 45 N.America E.Asia 5.092695e+05
## 46 W.Asia S.-E.Asia 4.971752e+05
## 47 S.-S.Africa W.Europe 4.918552e+05
## 48 N.Europe E.Europe 4.465578e+05
## 49 N.Europe N.Europe 4.227781e+05
## 50 S.-E.Asia S.Asia 4.224717e+05
## 51 N.America N.America 4.089598e+05
## 52 N.Africa W.Europe 4.041943e+05
## 53 N.America S.-E.Asia 3.999571e+05
## 54 S.Asia E.Asia 3.886821e+05
## 55 N.America S.Asia 3.842875e+05
## 56 N.Europe N.America 3.719556e+05
## 57 S.Europe N.Europe 3.612364e+05
## 58 S.-E.Asia W.Asia 3.451616e+05
## 59 W.Asia E.Europe 3.441296e+05
## 60 S.Europe N.America 3.377107e+05
## 61 W.Europe N.America 3.267881e+05
## 62 N.America S.Europe 3.200367e+05
## 63 N.America W.Europe 3.126155e+05
## 64 N.Europe S.-S.Africa 3.079969e+05
## 65 N.Africa S.Europe 2.892222e+05
## 66 W.Asia S.-S.Africa 2.845474e+05
## 67 S.-S.Africa S.Europe 2.723961e+05
## 68 N.America N.Europe 2.715975e+05
## 69 N.America W.Asia 2.672790e+05
## 70 W.Europe N.Africa 2.651488e+05
## 71 S.-S.Africa N.Europe 2.463437e+05
## 72 L.America & Carib. W.Europe 2.426301e+05
## 73 W.Europe N.Europe 2.416843e+05
## 74 N.America S.-S.Africa 2.416790e+05
## 75 W.Europe S.-S.Africa 2.407206e+05
## 76 E.Asia Aust. & New Z. 2.332566e+05
## 77 N.Europe W.Europe 2.308488e+05
## 78 S.-E.Asia Aust. & New Z. 2.248594e+05
## 79 W.Asia N.Europe 2.248291e+05
## 80 N.Europe S.Europe 2.165097e+05
## 81 E.Europe N.America 2.126428e+05
## 82 N.America E.Europe 2.090140e+05
## 83 Aust. & New Z. N.Europe 2.088138e+05
## 84 N.Europe S.Asia 2.075593e+05
## 85 N.Europe W.Asia 2.033158e+05
## 86 Aust. & New Z. S.-S.Africa 1.911174e+05
## 87 S.-S.Africa W.Asia 1.881868e+05
## 88 Aust. & New Z. Aust. & New Z. 1.843978e+05
## 89 S.Europe W.Asia 1.835839e+05
## 90 C.Asia C.Asia 1.824143e+05
## 91 Aust. & New Z. S.Europe 1.703848e+05
## 92 W.Europe L.America & Carib. 1.688760e+05
## 93 E.Asia W.Europe 1.669091e+05
## 94 W.Asia S.Europe 1.636120e+05
## 95 S.Europe S.-S.Africa 1.538364e+05
## 96 N.Africa N.America 1.521394e+05
## 97 Aust. & New Z. N.America 1.479706e+05
## 98 E.Asia L.America & Carib. 1.392352e+05
## 99 S.Europe N.Africa 1.378817e+05
## 100 Aust. & New Z. W.Europe 1.321426e+05
## 101 E.Asia S.Europe 1.318012e+05
## 102 Aust. & New Z. S.-E.Asia 1.301597e+05
## 103 S.Europe Aust. & New Z. 1.254318e+05
## 104 Aust. & New Z. E.Asia 1.248169e+05
## 105 S.-E.Asia N.Europe 1.245934e+05
## 106 Aust. & New Z. S.Asia 1.236664e+05
## 107 S.-E.Asia W.Europe 1.235252e+05
## 108 W.Europe S.Asia 1.227911e+05
## 109 N.Europe Aust. & New Z. 1.217727e+05
## 110 Aust. & New Z. W.Asia 1.203147e+05
## 111 W.Asia Aust. & New Z. 1.176183e+05
## 112 C.Asia W.Asia 1.164094e+05
## 113 L.America & Carib. E.Asia 1.141379e+05
## 114 E.Asia N.Europe 1.108687e+05
## 115 C.Asia W.Europe 1.097447e+05
## 116 S.Asia S.-S.Africa 9.555264e+04
## 117 N.America Aust. & New Z. 9.273747e+04
## 118 E.Asia S.Asia 9.038489e+04
## 119 E.Asia C.Asia 8.498340e+04
## 120 Pacific Isl. Aust. & New Z. 8.226811e+04
## 121 L.America & Carib. N.Europe 8.193139e+04
## 122 W.Europe C.Asia 7.678613e+04
## 123 N.Europe S.-E.Asia 7.353965e+04
## 124 S.Europe S.Asia 7.239940e+04
## 125 L.America & Carib. Aust. & New Z. 7.046328e+04
## 126 W.Europe Aust. & New Z. 7.024376e+04
## 127 S.-S.Africa Aust. & New Z. 6.943217e+04
## 128 N.Europe L.America & Carib. 6.661898e+04
## 129 W.Europe S.-E.Asia 6.367205e+04
## 130 Aust. & New Z. E.Europe 6.358107e+04
## 131 W.Europe E.Asia 6.156998e+04
## 132 N.Europe E.Asia 5.992926e+04
## 133 Aust. & New Z. L.America & Carib. 5.755456e+04
## 134 W.Asia C.Asia 5.423298e+04
## 135 S.-E.Asia S.Europe 5.181294e+04
## 136 E.Europe E.Asia 5.167469e+04
## 137 W.Asia E.Asia 5.096929e+04
## 138 S.Europe E.Asia 5.040697e+04
## 139 N.America N.Africa 4.960943e+04
## 140 Aust. & New Z. Pacific Isl. 4.867816e+04
## 141 L.America & Carib. W.Asia 4.034295e+04
## 142 L.America & Carib. S.-S.Africa 3.979507e+04
## 143 E.Asia E.Europe 3.889047e+04
## 144 E.Asia W.Asia 3.601480e+04
## 145 C.Asia E.Asia 3.531135e+04
## 146 S.Asia L.America & Carib. 3.485178e+04
## 147 E.Europe Aust. & New Z. 3.377098e+04
## 148 W.Asia L.America & Carib. 3.178517e+04
## 149 N.Africa N.Europe 3.144338e+04
## 150 E.Asia S.-S.Africa 3.074586e+04
## 151 E.Asia N.Africa 3.029126e+04
## 152 S.Asia E.Europe 2.624360e+04
## 153 Pacific Isl. Pacific Isl. 2.580481e+04
## 154 S.-S.Africa E.Asia 2.483994e+04
## 155 S.Europe S.-E.Asia 2.387793e+04
## 156 Pacific Isl. N.America 2.148355e+04
## 157 N.Africa E.Europe 2.140447e+04
## 158 S.-S.Africa L.America & Carib. 2.087021e+04
## 159 S.Europe C.Asia 1.816705e+04
## 160 E.Europe L.America & Carib. 1.758072e+04
## 161 S.-E.Asia E.Europe 1.721750e+04
## 162 C.Asia N.Europe 1.670001e+04
## 163 L.America & Carib. E.Europe 1.532220e+04
## 164 C.Asia S.Europe 1.519357e+04
## 165 S.-E.Asia L.America & Carib. 1.487694e+04
## 166 E.Europe S.-E.Asia 1.482096e+04
## 167 N.Europe N.Africa 1.407386e+04
## 168 S.-S.Africa S.Asia 1.393807e+04
## 169 Pacific Isl. W.Europe 1.291592e+04
## 170 N.Africa Aust. & New Z. 1.290945e+04
## 171 L.America & Carib. S.Asia 1.168875e+04
## 172 S.Asia C.Asia 1.158063e+04
## 173 W.Europe Pacific Isl. 1.059121e+04
## 174 N.America C.Asia 1.016261e+04
## 175 Pacific Isl. S.-E.Asia 9.871727e+03
## 176 N.America Pacific Isl. 9.539155e+03
## 177 C.Asia N.America 9.375839e+03
## 178 S.-E.Asia Pacific Isl. 9.173941e+03
## 179 Aust. & New Z. N.Africa 8.744198e+03
## 180 S.Asia N.Africa 8.658569e+03
## 181 E.Europe N.Africa 8.544951e+03
## 182 N.Europe C.Asia 7.367784e+03
## 183 S.-E.Asia N.Africa 7.191908e+03
## 184 S.-E.Asia S.-S.Africa 6.550741e+03
## 185 E.Europe S.-S.Africa 6.419902e+03
## 186 N.Africa S.-E.Asia 6.216503e+03
## 187 L.America & Carib. S.-E.Asia 6.015591e+03
## 188 S.-S.Africa E.Europe 5.729058e+03
## 189 E.Europe S.Asia 5.401474e+03
## 190 Pacific Isl. E.Asia 5.315576e+03
## 191 N.Africa E.Asia 4.426212e+03
## 192 E.Asia Pacific Isl. 4.297922e+03
## 193 N.Africa L.America & Carib. 3.833487e+03
## 194 C.Asia S.Asia 3.737000e+03
## 195 S.Asia Pacific Isl. 3.080222e+03
## 196 Aust. & New Z. C.Asia 2.913208e+03
## 197 L.America & Carib. N.Africa 2.910539e+03
## 198 S.-S.Africa S.-E.Asia 2.870051e+03
## 199 Pacific Isl. L.America & Carib. 2.811897e+03
## 200 Pacific Isl. S.-S.Africa 2.718508e+03
## 201 N.Africa S.Asia 2.463411e+03
## 202 C.Asia Aust. & New Z. 2.412342e+03
## 203 Pacific Isl. S.Europe 1.933068e+03
## 204 Pacific Isl. N.Europe 1.830673e+03
## 205 Pacific Isl. W.Asia 1.813712e+03
## 206 N.Africa C.Asia 1.642483e+03
## 207 N.Europe Pacific Isl. 1.530213e+03
## 208 W.Asia Pacific Isl. 1.428666e+03
## 209 L.America & Carib. Pacific Isl. 1.312939e+03
## 210 S.-E.Asia C.Asia 1.047915e+03
## 211 S.Europe Pacific Isl. 1.010922e+03
## 212 C.Asia N.Africa 9.971986e+02
## 213 C.Asia L.America & Carib. 8.258977e+02
## 214 C.Asia S.-E.Asia 7.631307e+02
## 215 Pacific Isl. S.Asia 7.397136e+02
## 216 L.America & Carib. C.Asia 6.043795e+02
## 217 S.-S.Africa Pacific Isl. 5.778444e+02
## 218 S.-S.Africa C.Asia 5.371896e+02
## 219 Pacific Isl. N.Africa 5.023947e+02
## 220 C.Asia S.-S.Africa 4.969010e+02
## 221 Pacific Isl. E.Europe 4.378835e+02
## 222 N.Africa Pacific Isl. 3.198668e+02
## 223 E.Europe Pacific Isl. 1.346757e+02
## 224 Pacific Isl. C.Asia 3.006633e+01
## 225 C.Asia Pacific Isl. 6.248329e+00
We print the total outflows (to get the percentages):
our_estimates_reg %>%
filter(type == "tot", year == "2020-2024") %>%
group_by(origin)%>%
summarise(across(where(is.numeric), sum, na.rm = TRUE)) %>%
select(origin, hat_transport_open, hat_transport_closed)## # A tibble: 15 × 3
## origin hat_transport_open hat_transport_closed
## <fct> <dbl> <dbl>
## 1 N.Europe 2534591. 2752354.
## 2 W.Europe 4572675. 4551210.
## 3 E.Europe 11989681. 10593733.
## 4 S.Europe 4170058. 4632390.
## 5 N.Africa 4011777. 4140308.
## 6 S.-S.Africa 8878559. 9008040.
## 7 W.Asia 8796369. 10068674.
## 8 S.Asia 10848715. 19291566.
## 9 S.-E.Asia 5593533. 4649337.
## 10 C.Asia 1229948. 1012016.
## 11 E.Asia 3923139. 4643308.
## 12 Aust. & New Z. 1218359. 1715256.
## 13 Pacific Isl. 134080. 170478.
## 14 N.America 5235461. 5175499.
## 15 L.America & Carib. 11280859. 8999562.
We print the differences between the two periods:
data.frame(
origin = our_estimates_reg[our_estimates_reg$year %in% c("2020-2024") &
our_estimates_reg$type == "tot" , "origin"],
dest = our_estimates_reg[our_estimates_reg$year %in% c("2020-2024") &
our_estimates_reg$type == "tot", "dest"],
year_0 = our_estimates_reg[our_estimates_reg$year %in% c("2015-2020") &
our_estimates_reg$type == "tot", "hat_transport_open"],
year_1 = our_estimates_reg[our_estimates_reg$year %in% c("2020-2024") &
our_estimates_reg$type == "tot", "hat_transport_open"],
diff = our_estimates_reg[our_estimates_reg$year %in% c("2020-2024") &
our_estimates_reg$type == "tot", "hat_transport_open"]
- our_estimates_reg[our_estimates_reg$year %in% c("2015-2020") &
our_estimates_reg$type == "tot", "hat_transport_open"]) %>%
arrange(-abs(diff)) %>%
head(5)## origin dest year_0 year_1 diff
## 1 E.Europe E.Europe 2657524 4745321 2087797
## 2 L.America & Carib. L.America & Carib. 6624379 4889411 -1734968
## 3 S.-S.Africa S.-S.Africa 7075061 5384988 -1690074
## 4 E.Europe W.Europe 1750962 3106401 1355439
## 5 W.Asia W.Asia 5496017 4151850 -1344167
data.frame(
origin = our_estimates_reg[our_estimates_reg$year %in% c("2020-2024") &
our_estimates_reg$type == "tot" , "origin"],
dest = our_estimates_reg[our_estimates_reg$year %in% c("2020-2024") &
our_estimates_reg$type == "tot", "dest"],
year_0 = our_estimates_reg[our_estimates_reg$year %in% c("2015-2020") &
our_estimates_reg$type == "tot", "hat_transport_closed"],
year_1 = our_estimates_reg[our_estimates_reg$year %in% c("2020-2024") &
our_estimates_reg$type == "tot", "hat_transport_closed"],
diff = our_estimates_reg[our_estimates_reg$year %in% c("2020-2024") &
our_estimates_reg$type == "tot", "hat_transport_closed"]
- our_estimates_reg[our_estimates_reg$year %in% c("2015-2020") &
our_estimates_reg$type == "tot", "hat_transport_closed"]) %>%
arrange(-abs(diff)) %>%
head(5)## origin dest year_0 year_1 diff
## 1 W.Asia W.Asia 7702389 5118946 -2583443
## 2 L.America & Carib. L.America & Carib. 6520121 4294090 -2226031
## 3 E.Europe E.Europe 2386834 4479709 2092875
## 4 S.-S.Africa S.-S.Africa 8283168 6314107 -1969061
## 5 S.Asia S.Asia 6291349 4595392 -1695957
7.1 Regional visualization of migration flows estimated by OT
We print the plots obtained in Appendix E.
7.1.1 Africa
Estimated intra-regional migration flows for Africa, derived from the Optimal Transport (OT) model applied to the open-adjusted bilateral stock tables. Results for the two most recent periods (2015–2020 and 2020–2024) are shown, along with the largest bilateral migration flows and the main countries of origin in terms of total outflows (Figure 15 in the article).
## origin dest hat_transport_open
## 1 SDN TCD 765183.6
## 2 SSD SDN 612507.7
## 3 SDN EGY 454824.6
## 4 BFA CIV 323426.5
## 5 MAR ESP 257855.4
## 6 ZWE ZAF 240794.5
## 7 SDN SSD 200981.4
## 8 CIV BFA 191557.7
## 9 SOM ETH 186999.2
## 10 SDN SAU 180243.0
## # A tibble: 10 × 2
## origin sum
## <chr> <dbl>
## 1 SDN 1898939.
## 2 SSD 842242.
## 3 EGY 740091.
## 4 MAR 538193.
## 5 COD 494017.
## 6 CIV 468872.
## 7 FRA 398186.
## 8 BFA 375370.
## 9 NGA 370278.
## 10 SOM 337431.
## origin dest hat_transport_open
## 1 SSD UGA 731282.8
## 2 SSD SDN 692711.6
## 3 BFA CIV 398864.1
## 4 ZWE ZAF 290171.1
## 5 MAR ESP 267932.9
## 6 COD UGA 262172.3
## 7 CIV BFA 237720.1
## 8 DZA FRA 219583.2
## 9 MAR FRA 201072.1
## 10 SDN SSD 169551.9
## # A tibble: 10 × 2
## origin sum
## <chr> <dbl>
## 1 SSD 1733943.
## 2 COD 701829.
## 3 EGY 647948.
## 4 MAR 637533.
## 5 CIV 582203.
## 6 SDN 576642.
## 7 FRA 460031.
## 8 BFA 456767.
## 9 NGA 454133.
## 10 ETH 365761.
7.1.2 America
Estimated intra-regional migration flows for Americas, derived from the Optimal Transport (OT) model applied to the open-adjusted bilateral stock tables. Results for the two most recent periods (2015–2020 and 2020–2024) are shown, along with the largest bilateral migration flows and the main countries of origin in terms of total outflows (Figure 16 in the article).
## origin dest hat_transport_open
## 1 VEN COL 1128516.0
## 2 MEX USA 902197.1
## 3 VEN PER 583090.6
## 4 USA MEX 392559.1
## 5 DOM USA 344079.9
## 6 VEN BRA 288171.1
## 7 GTM USA 287282.0
## 8 VEN USA 268492.7
## 9 SLV USA 251614.5
## 10 BRA USA 247866.5
## # A tibble: 10 × 2
## origin sum
## <chr> <dbl>
## 1 VEN 2762046.
## 2 USA 1757319.
## 3 MEX 959601.
## 4 COL 661940.
## 5 DOM 424521.
## 6 BRA 420725.
## 7 HTI 400062.
## 8 PER 339249.
## 9 HND 329445.
## 10 GTM 308461.
## origin dest hat_transport_open
## 1 VEN COL 1827381.8
## 2 VEN PER 992289.5
## 3 USA MEX 452174.5
## 4 VEN ECU 410036.6
## 5 VEN CHL 401279.9
## 6 PRI USA 372001.4
## 7 DOM USA 347529.2
## 8 GTM USA 344179.1
## 9 MEX USA 319941.8
## 10 VEN USA 313551.3
## # A tibble: 10 × 2
## origin sum
## <chr> <dbl>
## 1 VEN 4634746.
## 2 USA 2058487.
## 3 COL 616502.
## 4 HTI 447460.
## 5 DOM 442800.
## 6 BRA 393740.
## 7 CAN 393542.
## 8 PRI 390769.
## 9 GTM 364416.
## 10 MEX 357515.
7.1.3 Europe
Estimated intra-regional migration flows for Europe, derived from the Optimal Transport (OT) model applied to the open-adjusted bilateral stock tables. Results for the two most recent periods (2015–2020 and 2020–2024) are shown, along with the largest bilateral migration flows and the main countries of origin in terms of total outflows (Figure 17 in the article).
## origin dest hat_transport_open
## 1 UKR RUS 1475102.9
## 2 UKR DEU 1207043.6
## 3 UKR POL 954857.5
## 4 TUR SYR 614912.0
## 5 RUS UKR 520132.5
## 6 TUR DEU 440260.6
## 7 RUS KAZ 399508.5
## 8 UKR CZE 387203.8
## 9 RUS DEU 304178.5
## 10 SYR TUR 297189.3
## # A tibble: 10 × 2
## origin sum
## <chr> <dbl>
## 1 UKR 6195230.
## 2 RUS 2180118.
## 3 TUR 1749461.
## 4 DEU 1505546.
## 5 SYR 886380.
## 6 FRA 869216.
## 7 ROU 850269.
## 8 ITA 799171.
## 9 POL 766005.
## 10 GBR 716018.
## origin dest hat_transport_open
## 1 SYR TUR 1392664.7
## 2 RUS UKR 942839.6
## 3 UKR RUS 529310.4
## 4 RUS KAZ 512264.6
## 5 SYR DEU 402743.4
## 6 KAZ RUS 350639.6
## 7 LBN SYR 340052.5
## 8 RUS DEU 325211.9
## 9 IRQ TUR 316292.7
## 10 ROU GBR 308257.3
## # A tibble: 10 × 2
## origin sum
## <chr> <dbl>
## 1 RUS 2850652.
## 2 SYR 2653522.
## 3 DEU 1588907.
## 4 ROU 1066041.
## 5 FRA 1045839.
## 6 ITA 1043558.
## 7 UKR 1031081.
## 8 TUR 908631.
## 9 POL 805272.
## 10 IRQ 768101.
7.1.4 Asia
Estimated intra-regional migration flows for Asia, derived from the Optimal Transport (OT) model applied to the open-adjusted bilateral stock tables. Results for the two most recent periods (2015–2020 and 2020–2024) are shown, along with the largest bilateral migration flows and the main countries of origin in terms of total outflows (Figure 18 in the article).
## origin dest hat_transport_open
## 1 AFG IRN 1155572.3
## 2 IND ARE 542792.9
## 3 AFG PAK 471126.0
## 4 BGD SAU 403770.4
## 5 RUS KAZ 399508.5
## 6 MMR BGD 399199.8
## 7 YEM SAU 325514.2
## 8 CHN HKG 309340.3
## 9 PAK SAU 283519.0
## 10 MMR THA 274404.3
## # A tibble: 10 × 2
## origin sum
## <chr> <dbl>
## 1 IND 2343055.
## 2 AFG 1716520.
## 3 BGD 1352895.
## 4 PAK 1067125.
## 5 CHN 1012616.
## 6 MMR 967413.
## 7 RUS 926828.
## 8 SAU 775925.
## 9 MYS 769082.
## 10 IDN 645657.
## origin dest hat_transport_open
## 1 MMR BGD 673475.9
## 2 RUS KAZ 512264.6
## 3 CHN HKG 363765.1
## 4 KAZ RUS 350639.6
## 5 IND ARE 333536.7
## 6 MMR THA 316824.1
## 7 VNM JPN 309564.8
## 8 AFG IRN 302256.9
## 9 IND AUS 296265.4
## 10 BGD SAU 274428.7
## # A tibble: 10 × 2
## origin sum
## <chr> <dbl>
## 1 IND 2068037.
## 2 CHN 1458471.
## 3 MMR 1340303.
## 4 RUS 1127659.
## 5 BGD 1121910.
## 6 PAK 1027003.
## 7 SAU 944881.
## 8 MYS 735288.
## 9 AFG 626534.
## 10 VNM 626006.